Browse Source

Save HTML to the shared folder for the rest of the markets: Tor2door, TorBay, and TorMarket

main
westernmeadow 1 year ago
parent
commit
30b1ab8bda
3 changed files with 27 additions and 18 deletions
  1. +9
    -5
      MarketPlaces/Tor2door/crawler_selenium.py
  2. +9
    -8
      MarketPlaces/TorBay/crawler_selenium.py
  3. +9
    -5
      MarketPlaces/TorMarket/crawler_selenium.py

+ 9
- 5
MarketPlaces/Tor2door/crawler_selenium.py View File

@ -23,8 +23,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.Tor2door.parser import tor2door_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1
baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion'
@ -48,6 +46,8 @@ def startCrawling():
# Opens Tor Browser
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@ -131,6 +131,8 @@ def closetor(driver):
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@ -184,12 +186,14 @@ def savePage(page, url):
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    """Return the full path of the HTML file in which the page at *url* is saved.

    The result has the layout
    ``<shared_folder>/MarketPlaces/<market>/HTML_Pages/<CURRENT_DATE>/<kind>/<name>.html``
    where ``shared_folder`` comes from the ``Project`` section of the config,
    ``<market>`` from ``getMKTName()``, ``<name>`` from ``getNameFromURL(url)``
    and ``<kind>`` is ``Description`` or ``Listing`` depending on
    ``isDescriptionLink(url)``.

    :param url: URL of the page being saved.
    :return: the path as a string, assembled with ``os.path.join``.
    """
    # Imported at call time, like the other functions of this crawler do.
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'),
                           "MarketPlaces", getMKTName(), "HTML_Pages")
    fileName = getNameFromURL(url)

    # Description pages and listing pages go to separate sub-folders.
    subDir = 'Description' if isDescriptionLink(url) else 'Listing'

    # Each component is passed on its own so os.path.join inserts the
    # platform's separator; hard-coded backslashes would become part of the
    # file name on non-Windows systems.
    return os.path.join(mainDir, CURRENT_DATE, subDir, fileName + '.html')


+ 9
- 8
MarketPlaces/TorBay/crawler_selenium.py View File

@ -25,8 +25,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.TorBay.parser import torbay_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1
baseURL = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion/'
@ -52,6 +50,8 @@ def startCrawling():
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@ -90,6 +90,8 @@ def closetor(driver):
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@ -154,15 +156,14 @@ def savePage(page, url):
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
    """Return the full path of the HTML file in which the page at *url* is saved.

    The result has the layout
    ``<shared_folder>/MarketPlaces/<market>/HTML_Pages/<CURRENT_DATE>/<kind>/<name>.html``
    where ``shared_folder`` comes from the ``Project`` section of the config,
    ``<market>`` from ``getMKTName()``, ``<name>`` from ``getNameFromURL(url)``
    and ``<kind>`` is ``Description`` or ``Listing`` depending on
    ``isDescriptionLink(url)``.

    :param url: raw URL as the crawler crawls through every site.
    :return: the path as a string, assembled with ``os.path.join``.
    """
    # Imported at call time, like the other functions of this crawler do.
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'),
                           "MarketPlaces", getMKTName(), "HTML_Pages")
    fileName = getNameFromURL(url)

    # Description pages and listing pages go to separate sub-folders.
    subDir = 'Description' if isDescriptionLink(url) else 'Listing'

    # The date folder comes from the shared CURRENT_DATE value instead of being
    # recomputed from date.today() here. Each component is passed on its own so
    # os.path.join inserts the platform's separator; hard-coded backslashes
    # would become part of the file name on non-Windows systems.
    return os.path.join(mainDir, CURRENT_DATE, subDir, fileName + '.html')


+ 9
- 5
MarketPlaces/TorMarket/crawler_selenium.py View File

@ -24,8 +24,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.TorMarket.parser import tormarket_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML
config = configparser.ConfigParser()
config.read('../../setup.ini')
counter = 1
baseURL = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/'
@ -51,6 +49,8 @@ def startCrawling():
# Opens Tor Browser
#prompts for ENTER input to continue
def opentor():
from MarketPlaces.Initialization.markets_mining import config
global pid
print("Connecting Tor...")
pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@ -89,6 +89,8 @@ def closetor(driver):
# Creates FireFox 'driver' and configure its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
from MarketPlaces.Initialization.markets_mining import config
ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@ -153,12 +155,14 @@ def savePage(page, url):
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
    """Return the full path of the HTML file in which the page at *url* is saved.

    The result has the layout
    ``<shared_folder>/MarketPlaces/<market>/HTML_Pages/<CURRENT_DATE>/<kind>/<name>.html``
    where ``shared_folder`` comes from the ``Project`` section of the config,
    ``<market>`` from ``getMKTName()``, ``<name>`` from ``getNameFromURL(url)``
    and ``<kind>`` is ``Description`` or ``Listing`` depending on
    ``isDescriptionLink(url)``.

    :param url: raw URL as the crawler crawls through every site.
    :return: the path as a string, assembled with ``os.path.join``.
    """
    # Imported at call time, like the other functions of this crawler do.
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'),
                           "MarketPlaces", getMKTName(), "HTML_Pages")
    fileName = getNameFromURL(url)

    # Description pages and listing pages go to separate sub-folders.
    subDir = 'Description' if isDescriptionLink(url) else 'Listing'

    # Each component is passed on its own so os.path.join inserts the
    # platform's separator; hard-coded backslashes would become part of the
    # file name on non-Windows systems.
    return os.path.join(mainDir, CURRENT_DATE, subDir, fileName + '.html')


Loading…
Cancel
Save