diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml index b4bb6d5..b4b832d 100644 --- a/.idea/DW_Pipeline_Test.iml +++ b/.idea/DW_Pipeline_Test.iml @@ -2,7 +2,7 @@ - + @@ -12,6 +12,8 @@ diff --git a/.idea/misc.xml b/.idea/misc.xml index 11f1ea0..baf04e9 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py index 20bf4c5..547b765 100644 --- a/Forums/Altenens/crawler_selenium.py +++ b/Forums/Altenens/crawler_selenium.py @@ -61,18 +61,16 @@ def opentor(): # Login using premade account credentials and do login captcha manually def login(driver): #click login button - login = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]').\ - get_attribute('href') - driver.get(login) - # login.click() - - # #entering username and password into input boxes - # usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[1]/dd') - # #Username here - # usernameBox.send_keys('mylittlepony45')#sends string to the username box - # passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[4]/div/div/div[3]/div/div/div/form/div[1]/div/dl[2]/dd/div/div') - # #Password here - # passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox + login = driver.find_element(by=By.XPATH, value='//*[@id="top"]/div[1]/div/div/div/div[1]/a[1]') + login.click() + + #entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="_xfUid-1-1688066635"]') + #Username here + usernameBox.send_keys('mylittlepony45')#sends string to the username box + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="_xfUid-2-1688066635"]') + #Password here + passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox input("Press ENTER when CAPTCHA is completed\n") diff --git a/MarketPlaces/AnonymousMarketplace/crawler_selenium.py b/MarketPlaces/AnonymousMarketplace/crawler_selenium.py index a37915c..dd35a69 100644 --- a/MarketPlaces/AnonymousMarketplace/crawler_selenium.py +++ b/MarketPlaces/AnonymousMarketplace/crawler_selenium.py @@ -25,8 +25,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.AnonymousMarketplace.parser import anonymous_links_parser from MarketPlaces.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/' @@ -52,6 +50,8 @@ def startCrawling(): # Opens Tor Browser #prompts for ENTER input to continue def opentor(): + from MarketPlaces.Initialization.markets_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -90,6 +90,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -154,12 +156,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import CURRENT_DATE + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\AnonymousMarketplace\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\AnonymousMarketplace\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/MarketPlaces/Apocalypse/crawler_selenium.py b/MarketPlaces/Apocalypse/crawler_selenium.py index d01814d..b46e4af 100644 --- a/MarketPlaces/Apocalypse/crawler_selenium.py +++ b/MarketPlaces/Apocalypse/crawler_selenium.py @@ -25,8 +25,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.Apocalypse.parser import apocalypse_links_parser from MarketPlaces.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/' @@ -52,6 +50,8 @@ def startCrawling(): # Opens Tor Browser #prompts for ENTER input to continue def opentor(): + from MarketPlaces.Initialization.markets_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -90,6 +90,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -170,12 +172,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import CURRENT_DATE + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\Apocalypse\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\Apocalypse\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/MarketPlaces/CityMarket/crawler_selenium.py b/MarketPlaces/CityMarket/crawler_selenium.py index c1c3e88..1384c18 100644 --- a/MarketPlaces/CityMarket/crawler_selenium.py +++ b/MarketPlaces/CityMarket/crawler_selenium.py @@ -26,8 +26,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.CityMarket.parser import city_links_parser from MarketPlaces.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/' @@ -53,6 +51,8 @@ def startCrawling(): # Opens Tor Browser #prompts for ENTER input to continue def opentor(): + from MarketPlaces.Initialization.markets_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -91,6 +91,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -167,12 +169,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import CURRENT_DATE + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\CityMarket\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\CityMarket\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/MarketPlaces/CypherMarketplace/crawler_selenium.py b/MarketPlaces/CypherMarketplace/crawler_selenium.py index 3b758ce..aa587c4 100644 --- a/MarketPlaces/CypherMarketplace/crawler_selenium.py +++ b/MarketPlaces/CypherMarketplace/crawler_selenium.py @@ -2,6 +2,7 @@ __author__ = 'Helium' ''' CypherMarketplace Forum Crawler (Selenium) +crawler done ''' from selenium import webdriver @@ -24,8 +25,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.CypherMarketplace.parser import cyphermarketplace_links_parser from MarketPlaces.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion/' @@ -51,6 +50,8 @@ def startCrawling(): # Opens Tor Browser #prompts for ENTER input to continue def opentor(): + from MarketPlaces.Initialization.markets_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -89,6 +90,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -167,12 +170,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import CURRENT_DATE + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\CypherMarketplace\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\CypherMarketplace\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/MarketPlaces/DarkFox/crawler_selenium.py b/MarketPlaces/DarkFox/crawler_selenium.py index 1539c3f..3967567 100644 --- a/MarketPlaces/DarkFox/crawler_selenium.py +++ b/MarketPlaces/DarkFox/crawler_selenium.py @@ -42,16 +42,17 @@ def startCrawling(): print(driver.current_url, e) closetor(driver) - new_parse(mktName, False) + new_parse(mktName, baseURL, False) # Opens Tor Browser #prompts for ENTER input to continue def opentor(): + from MarketPlaces.Initialization.markets_mining import config + global pid print("Connecting Tor...") - path = open('../../path.txt').readline().strip() - pro = subprocess.Popen(path) + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) pid = pro.pid time.sleep(7.5) input('Tor Connected. Press ENTER to continue\n') @@ -93,12 +94,11 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): - file = open('../../path.txt', 'r') - lines = file.readlines() + from MarketPlaces.Initialization.markets_mining import config - ff_binary = FirefoxBinary(lines[0].strip()) + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - ff_prof = FirefoxProfile(lines[1].strip()) + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) # ff_prof.set_preference("places.history.enabled", False) # ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) # ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) @@ -120,10 +120,11 @@ def createFFDriver(): ff_prof.set_preference("javascript.enabled", False) ff_prof.update_preferences() - service = Service(lines[2].strip()) + service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + return driver @@ -185,15 +186,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\DarkFox\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\DarkFox\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/MarketPlaces/DarkMatter/crawler_selenium.py b/MarketPlaces/DarkMatter/crawler_selenium.py index 0f85fea..21f6035 100644 --- a/MarketPlaces/DarkMatter/crawler_selenium.py +++ b/MarketPlaces/DarkMatter/crawler_selenium.py @@ -26,8 +26,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.DarkMatter.parser import darkmatter_links_parser from MarketPlaces.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/' @@ -53,6 +51,8 @@ def startCrawling(): # Opens Tor Browser #prompts for ENTER input to continue def opentor(): + from MarketPlaces.Initialization.markets_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -91,8 +91,11 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof.set_preference("places.history.enabled", False) ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) @@ -154,12 +157,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import CURRENT_DATE + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\DarkMatter\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\DarkMatter\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/MarketPlaces/DarkTor/crawler_selenium.py b/MarketPlaces/DarkTor/crawler_selenium.py index 98beb9b..c4cbfd4 100644 --- a/MarketPlaces/DarkTor/crawler_selenium.py +++ b/MarketPlaces/DarkTor/crawler_selenium.py @@ -24,8 +24,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.DarkTor.parser import darktor_links_parser from MarketPlaces.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/' @@ -51,6 +49,8 @@ def startCrawling(): # Opens Tor Browser #prompts for ENTER input to continue def opentor(): + from MarketPlaces.Initialization.markets_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -89,6 +89,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -153,12 +155,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import CURRENT_DATE + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\DarkTor\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\DarkTor\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/MarketPlaces/DigitalThriftShop/crawler_selenium.py b/MarketPlaces/DigitalThriftShop/crawler_selenium.py index 2da39ac..e602a4f 100644 --- a/MarketPlaces/DigitalThriftShop/crawler_selenium.py +++ b/MarketPlaces/DigitalThriftShop/crawler_selenium.py @@ -24,8 +24,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.DigitalThriftShop.parser import digitalthriftshop_links_parser from MarketPlaces.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/' @@ -89,6 +87,9 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -153,12 +154,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import CURRENT_DATE + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\DigitalThriftShop\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\DigitalThriftShop\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/MarketPlaces/Initialization/geckodriver.log b/MarketPlaces/Initialization/geckodriver.log index 8dc8c76..84cf0cd 100644 --- a/MarketPlaces/Initialization/geckodriver.log +++ b/MarketPlaces/Initialization/geckodriver.log @@ -10999,3 +10999,43 @@ unwatchForTargets()@TargetList.jsm:37 destructor()@TargetList.jsm:109 stop()@CDP.jsm:104 close()@RemoteAgent.jsm:138 +1688064782612 geckodriver INFO Listening on 127.0.0.1:53074 +1688064786507 mozrunner::runner INFO Running command: "C:\\Users\\\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "53075" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\Helium\\AppData\\Local\\Temp\\rust_mozprofilee4dopq" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1688064787228 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:53075/devtools/browser/f975a788-4cc3-4b93-8ff4-00ebb08433d1 +1688064788880 Marionette INFO Listening on port 52296 +1688064788914 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilee4dopq\thumbnails) because it does not exist +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +1688065043692 Marionette INFO Stopped listening on port 52296 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\Helium\AppData\Local\Temp\rust_mozprofilee4dopq\thumbnails) because it does not exist +[Parent 6908, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167 + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1688065044075 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 diff --git a/MarketPlaces/LionMarketplace/crawler_selenium.py b/MarketPlaces/LionMarketplace/crawler_selenium.py index 5212aad..3310aca 100644 --- a/MarketPlaces/LionMarketplace/crawler_selenium.py +++ b/MarketPlaces/LionMarketplace/crawler_selenium.py @@ -24,8 +24,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.LionMarketplace.parser import lionmarketplace_links_parser from MarketPlaces.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/' @@ -51,6 +49,8 @@ def startCrawling(): # Opens Tor Browser #prompts for ENTER input to continue def opentor(): + from MarketPlaces.Initialization.markets_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -89,6 +89,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -154,12 +156,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import CURRENT_DATE + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\LionMarketplace\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\LionMarketplace\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/MarketPlaces/M00nkeyMarket/crawler_selenium.py b/MarketPlaces/M00nkeyMarket/crawler_selenium.py index 92898dd..1a4776e 100644 --- a/MarketPlaces/M00nkeyMarket/crawler_selenium.py +++ b/MarketPlaces/M00nkeyMarket/crawler_selenium.py @@ -25,8 +25,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.M00nkeyMarket.parser import m00nkey_links_parser from MarketPlaces.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/' @@ -52,6 +50,8 @@ def startCrawling(): # Opens Tor Browser #prompts for ENTER input to continue def opentor(): + from MarketPlaces.Initialization.markets_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -90,6 +90,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -170,12 +172,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import CURRENT_DATE + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\M00nkeyMarket\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\M00nkeyMarket\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/MarketPlaces/MikesGrandStore/crawler_selenium.py b/MarketPlaces/MikesGrandStore/crawler_selenium.py index f93c682..c65dbb5 100644 --- a/MarketPlaces/MikesGrandStore/crawler_selenium.py +++ b/MarketPlaces/MikesGrandStore/crawler_selenium.py @@ -24,8 +24,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.MikesGrandStore.parser import mikesgrandstore_links_parser from MarketPlaces.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion/' @@ -51,6 +49,8 @@ def startCrawling(): # Opens Tor Browser #prompts for ENTER input to continue def opentor(): + from MarketPlaces.Initialization.markets_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -89,6 +89,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -153,12 +155,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import CURRENT_DATE + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\MikesGrandStore\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\MikesGrandStore\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath diff --git a/setup.ini b/setup.ini index fd4b09a..193994f 100644 --- a/setup.ini +++ b/setup.ini @@ -1,10 +1,11 @@ + [TOR] -firefox_binary_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\firefox.exe -firefox_profile_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default -geckodriver_path = C:\Users\calsyslab\Projects\dw_pipeline_test\selenium\geckodriver.exe +firefox_binary_path = C:\Users\\Helium\Desktop\Tor Browser\Browser\firefox.exe +firefox_profile_path = C:\Users\\Helium\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default +geckodriver_path = C:\Users\\Helium\\PycharmProjects\dw_pipeline_test\selenium\geckodriver.exe [Project] -project_directory = C:\Users\calsyslab\Projects\dw_pipeline_test +project_directory = C:\Users\Helium\\PycharmProjects\dw_pipeline_test shared_folder = \\VBoxSvr\Shared [PostgreSQL]