
Added incomplete MGM crawler manually due to issues with VM

main
LynnTaka 8 months ago
parent
commit
c996543748
1 changed file with 277 additions and 0 deletions
MarketPlaces/MGMGrand/crawler_selenium.py

@@ -0,0 +1,277 @@
__author__ = 'DarkWeb'

'''
MGMGrand marketplace Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from PIL import Image
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
import configparser
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.MGMGrand.parser import mgm_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://duysanjqxo4svh35yqkxxe5r54z2xc5tjf6r3ichxd3m2rwcgabf44ad.onion'


def startCrawling():
    mktName = getMKTName()
    driver = getAccess()

    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closeDriver(driver)

    new_parse(mktName, baseURL, True)


# Returns the name of the website
def getMKTName() -> str:
    name = 'MGMGrand'
    return name


# Returns the base link of the website
def getFixedURL():
    url = 'http://duysanjqxo4svh35yqkxxe5r54z2xc5tjf6r3ichxd3m2rwcgabf44ad.onion'
    return url


# Closes Tor Browser
def closeDriver(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return


# Creates a FireFox 'driver' and configures its 'Profile'
# to use the Tor proxy and socket
def createFFDriver():
    from MarketPlaces.Initialization.markets_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)  # 3 = block third-party images
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    # route all traffic through the local Tor SOCKS5 proxy (Tor Browser listens on 9150)
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
    driver.maximize_window()

    return driver
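

# Note: Selenium 4.10+ removed the firefox_binary/firefox_profile keyword
# arguments used above. If the pinned Selenium version is ever upgraded, a
# rough equivalent (a sketch, not tested against this site) would be:
#
#   from selenium.webdriver.firefox.options import Options
#   ff_opts = Options()
#   ff_opts.binary_location = config.get('TOR', 'firefox_binary_path')
#   ff_opts.profile = ff_prof
#   driver = webdriver.Firefox(service=service, options=ff_opts)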


# The driver 'gets' the url; if the site can't be accessed, returns 'down'
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        driver.close()
        return 'down'


def login(driver):
    # wait for the login fields to render
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, '//*[@id="username"]')))

    # entering username and password into input boxes
    usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
    # Username here
    usernameBox.send_keys('blabri')
    passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]')
    # Password here
    passwordBox.send_keys('fishowal')

    input("Press ENTER when the captcha is solved and you're logged in")

    # wait for the post-login element to be visible
    WebDriverWait(driver, 100).until(
        EC.visibility_of_element_located((By.XPATH, '/html/body/div[2]/div[2]/a/span'))
    )

    # find the element and click it
    temp = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/a/span')
    temp.click()


def savePage(driver, page, url):
    cleanPage = cleanHTML(driver, page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    # write the cleaned page out, closing the file handle when done
    with open(filePath, 'wb') as file:
        file.write(cleanPage.encode('utf-8'))
    return


def getFullPathName(url):
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    # build the path with os.path.join instead of hard-coded '\\' separators
    # so it also works outside Windows
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Description', fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Listing', fileName + '.html')
    return fullPath


def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
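
# Illustrative example (hypothetical URL): the name is just the URL's
# alphanumeric characters, so
#   getNameFromURL('http://example.onion/product/abc-123')
# returns 'httpexampleonionproductabc123'; an empty result falls back to the
# global counter.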


def getInterestedLinks():
    links = []

    # Carding
    links.append('http://duysanjqxo4svh35yqkxxe5r54z2xc5tjf6r3ichxd3m2rwcgabf44ad.onion/category/c54f4d30-9060-11eb-be9c-630550815967')
    # # softwares and malwares
    # links.append('http://duysanjqxo4svh35yqkxxe5r54z2xc5tjf6r3ichxd3m2rwcgabf44ad.onion/category/6a211fa0-9062-11eb-b3bd-d7d946c69ce2')
    # # social engineering tutorial
    # links.append('http://duysanjqxo4svh35yqkxxe5r54z2xc5tjf6r3ichxd3m2rwcgabf44ad.onion/category/5596f690-9063-11eb-9ebb-6b44dbdbc454')
    # # misc services
    # links.append('http://duysanjqxo4svh35yqkxxe5r54z2xc5tjf6r3ichxd3m2rwcgabf44ad.onion/category/b212fe30-9063-11eb-a479-35581612f6c2')
    # # hacking tutorials
    # links.append('http://duysanjqxo4svh35yqkxxe5r54z2xc5tjf6r3ichxd3m2rwcgabf44ad.onion/category/2ed713f0-9063-11eb-bf9c-232fd3b49a98')

    return links


def crawlForum(driver):
    print("Crawling the MGMGrand market")

    linksToCrawl = getInterestedLinks()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            has_next_page = True
            count = 0

            while has_next_page:
                try:
                    driver.get(link)
                except:
                    driver.refresh()
                html = driver.page_source
                savePage(driver, html, link)

                productList = productPages(html)
                for item in productList:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver, driver.page_source, item)
                    driver.back()

                    # testing break: stop after the first product (comment out for full runs)
                    break

                # testing limit: stop after a few listing pages (uncomment as needed)
                # if count == 4:
                #     break

                try:
                    # the last <li> of the pagination bar holds the "next page" link;
                    # By.CLASS_NAME cannot match the compound class 'pagination mb-0',
                    # so a CSS selector is used instead
                    li_tags = driver.find_element(by=By.CSS_SELECTOR, value='.pagination.mb-0').find_elements(by=By.TAG_NAME, value='li')
                    a_tag = li_tags[-1].find_element(by=By.TAG_NAME, value='a')
                    if a_tag:
                        try:
                            link = a_tag.get_attribute('href')
                            print(link)
                        except:
                            link = ''
                    if link == "":
                        raise NoSuchElementException
                    count += 1
                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    print("Crawling the MGMGrand market done.")


# Returns 'True' if the link points to a product description page; may need to change for every website
def isDescriptionLink(url):
    if 'product' in url:
        return True
    return False


# Returns 'True' if the link is a listing page link; may need to change for every website
def isListingLink(url):
    if 'category' in url:
        return True
    return False
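
# Illustrative classification, assuming the URL shapes seen above:
#   a URL containing '/product/...'  -> description page (saved under Description/)
#   a URL containing '/category/...' -> listing page (saved under Listing/)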


def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return mgm_links_parser(soup)


def crawler():
    startCrawling()
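

# Minimal sketch for running this crawler standalone; the Initialization
# scripts normally drive it, and the TOR/Project config entries used above
# must be present:
if __name__ == '__main__':
    crawler()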
