diff --git a/MarketPlaces/Kerberos/crawler_selenium.py b/MarketPlaces/Kerberos/crawler_selenium.py
index d199c73..354a5bf 100644
--- a/MarketPlaces/Kerberos/crawler_selenium.py
+++ b/MarketPlaces/Kerberos/crawler_selenium.py
@@ -2,6 +2,8 @@ __author__ = 'Helium'
 
 '''
 Kerberos Market Crawler (Selenium)
+
+able to catch crawlers
 '''
 
 from selenium import webdriver
@@ -42,21 +44,9 @@ def startCrawling():
     # new_parse(mktName, baseURL, True)
 
-
-# Opens Tor Browser
-def opentor():
-    global pid
-    print("Connecting Tor...")
-    path = open('../../path.txt').readline().strip()
-    pro = subprocess.Popen(path)
-    pid = pro.pid
-    time.sleep(7.5)
-    input('Tor Connected. Press ENTER to continue\n')
-    return
-
 def captcha(driver):
     # do captchas manually and then wait
-    input('Complete CAPTCHA\'s manually\nthen press enter when completed')
+    input('Complete CAPTCHA\'s manually then press enter when completed')
 
 
 def closeDriver(driver):
     # global pid
@@ -129,7 +119,6 @@
 # Return the link of the website
 def getFixedURL():
     url = 'http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion'
-
     return url
 
 
@@ -159,8 +148,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    ff_prof.set_preference("network.dns.disablePrefetch", True)
-    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    # ff_prof.set_preference("network.dns.disablePrefetch", True)
+    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -195,8 +184,8 @@ def getAccess():
 
 
 # Saves the crawled html page
-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -205,15 +194,14 @@
 
 # Gets the full path of the page to be saved along with its appropriate file name
 def getFullPathName(url):
+    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
+
+    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
     fileName = getNameFromURL(url)
     if isDescriptionLink(url):
-        fullPath = r'..\Kerberos\HTML_Pages\\' + str(
-            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
-            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
+        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
     else:
-        fullPath = r'..\Kerberos\HTML_Pages\\' + str(
-            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
-            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
+        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
     return fullPath
 
 
@@ -232,22 +220,22 @@ def getInterestedLinks():
     # Services - Hacking
     links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/99/block/price-none/ww/ww/1/')
-    # Tutorials - Hacking
-    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/122/block/price-none/ww/ww/1/')
-    # Tutorials - Guides
-    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/124/block/price-none/ww/ww/1/')
-    # Tutorials - Other
-    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/126/block/price-none/ww/ww/1/')
-    # Software and Malware - Botnets
-    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/129/block/price-none/ww/ww/1/')
-    # Software and Malware - Malware
-    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/130/block/price-none/ww/ww/1/')
-    # Software and Malware - Trojans
-    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/131/block/price-none/ww/ww/1/')
-    # Software and Malware - Exploits / Kits
-    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/133/block/price-none/ww/ww/1/')
-    # Software and Malware - Other
-    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/136/block/price-none/ww/ww/1/')
+    # # Tutorials - Hacking
+    # links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/122/block/price-none/ww/ww/1/')
+    # # Tutorials - Guides
+    # links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/124/block/price-none/ww/ww/1/')
+    # # Tutorials - Other
+    # links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/126/block/price-none/ww/ww/1/')
+    # # Software and Malware - Botnets
+    # links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/129/block/price-none/ww/ww/1/')
+    # # Software and Malware - Malware
+    # links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/130/block/price-none/ww/ww/1/')
+    # # Software and Malware - Trojans
+    # links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/131/block/price-none/ww/ww/1/')
+    # # Software and Malware - Exploits / Kits
+    # links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/133/block/price-none/ww/ww/1/')
+    # # Software and Malware - Other
+    # links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/136/block/price-none/ww/ww/1/')
 
     return links
 
@@ -255,57 +243,114 @@ def getInterestedLinks():
 def crawlForum(driver):
     print("Crawling the Kerberos market")
 
+    # linksToCrawl = getInterestedLinks()
+    # visited = set(linksToCrawl)
+    # initialTime = time.time()
+    #
+    # i = 0
+    # count = 0
+    # while i < len(linksToCrawl):
+    #     link = linksToCrawl[i]
+    #     print('Crawling :', link)
+    #
+    #     try:
+    #         try:
+    #             driver.get(link)
+    #         except:
+    #             driver.refresh()
+    #         html = driver.page_source
+    #         savePage(driver, html, link)
+    #
+    #         has_next_page = True
+    #         while has_next_page:
+    #             list = productPages(html)
+    #             for item in list:
+    #                 itemURL = urlparse.urljoin(baseURL, str(item))
+    #                 try:
+    #                     driver.get(itemURL)
+    #                 except:
+    #                     driver.refresh()
+    #                 savePage(driver,driver.page_source, item)
+    #                 driver.back()
+    #                 # break
+    #
+    #             if count == 1:
+    #                 count = 0
+    #                 break
+    #
+    #             try:
+    #                 # /html/body/div[4]/div[4]/div[4]/div/div[1]/div[28]/a[15]
+    #                 # /html/body/div[4]/div[4]/div[4]/div/div[1]/div[28]/a[16]
+    #                 nav = driver.find_element(by=By.XPATH, value=
+    #                     '/html/body/div[3]/div[4]/div[4]/div/div[1]/div[28]')
+    #                 a = nav.find_element(by=By.LINK_TEXT, value="Next")
+    #                 link = a.get_attribute('href')
+    #
+    #                 if link == "":
+    #                     raise NoSuchElementException
+    #                 try:
+    #                     driver.get(link)
+    #                 except:
+    #                     driver.refresh()
+    #                 html = driver.page_source
+    #                 savePage(driver, html, link)
+    #                 count += 1
+    #
+    #             except NoSuchElementException:
+    #                 has_next_page = False
+    #
+    #     except Exception as e:
+    #         print(link, e)
+    #     i += 1
+
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
     i = 0
-    count = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
+
             while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(driver, html, link)
                 list = productPages(html)
+
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
                     try:
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, item)
+                    savePage(driver, driver.page_source, item)
                     driver.back()
+                    time.sleep(5)
+
+                    # comment out
                     # break
 
+                # comment out
                 if count == 1:
-                    count = 0
                     break
 
                 try:
                     # /html/body/div[4]/div[4]/div[4]/div/div[1]/div[28]/a[15]
-                    # /html/body/div[4]/div[4]/div[4]/div/div[1]/div[28]/a[16]
+                    # /html/body/div[4]/div[4]/div[4]/div/div[1]/div[28]/a[15]
+                    # /html/body/div[4]/div[4]/div[4]/div/div[1]/div[28]/a[15]
+                    # /html/body/div[4]/div[4]/div[4]/div/div[1]/div[28]/a[3]
                     nav = driver.find_element(by=By.XPATH, value=
-                                              '/html/body/div[3]/div[4]/div[4]/div/div[1]/div[28]')
+                                              '/html/body/div[4]/div[4]/div[4]/div/div[1]/div[28]')
                     a = nav.find_element(by=By.LINK_TEXT, value="Next")
                     link = a.get_attribute('href')
-
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -315,9 +360,6 @@ def crawlForum(driver):
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
    input("Crawling Kerberos market done sucessfully. Press ENTER to continue\n")
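One caveat in the rewritten getFullPathName: the directory root now comes from os.path.join, but the date and Description/Listing segments are still glued on with raw Windows-style strings, and r'\\Description\\' is a raw string containing two literal backslashes, so on POSIX systems the whole suffix ends up inside a single file name instead of a directory tree. A minimal portable sketch, assuming the same helpers (getMKTName, getNameFromURL, isDescriptionLink) and the same 'Project'/'shared_folder' config key that appear in the diff:

```python
import os

def getFullPathName(url):
    # config and CURRENT_DATE come from the crawler's own initialization
    # module, exactly as in the diff
    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE

    # hand every component to os.path.join separately so the separator is
    # always correct for the host OS; no raw backslash literals
    mainDir = os.path.join(config.get('Project', 'shared_folder'),
                           "MarketPlaces", getMKTName(), "HTML_Pages")
    subdir = 'Description' if isDescriptionLink(url) else 'Listing'
    fileName = getNameFromURL(url)
    return os.path.join(mainDir, CURRENT_DATE, subdir, fileName + '.html')
```

On Windows the two variants produce equivalent paths; the difference only shows up when the crawler runs on Linux or macOS.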
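The core behavioral change in crawlForum is that the page fetch and the savePage call now sit at the top of the while has_next_page loop, so the "Next" handler only resolves the link and the following iteration downloads it; the large commented-out block keeps the old flow around for reference. A condensed sketch of the resulting pattern, reusing the crawler's savePage and productPages helpers; crawl_one_category, base_url, and max_pages are illustrative names, and the brittle container XPath is reduced to a plain link-text lookup:

```python
import time
import urllib.parse as urlparse

from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


def crawl_one_category(driver, link, base_url, max_pages=None):
    pages = 0
    has_next_page = True
    while has_next_page:
        # fetch the current listing page at the top of the loop,
        # with the same single-retry fallback the diff uses
        try:
            driver.get(link)
        except Exception:
            driver.refresh()
        html = driver.page_source
        savePage(driver, html, link)          # crawler helper from the diff

        # visit and save every product page linked from this listing
        for item in productPages(html):       # crawler helper from the diff
            itemURL = urlparse.urljoin(base_url, str(item))
            try:
                driver.get(itemURL)
            except Exception:
                driver.refresh()
            savePage(driver, driver.page_source, item)
            driver.back()
            time.sleep(5)                     # pacing, as added in the diff

        pages += 1
        if max_pages is not None and pages >= max_pages:
            break                             # optional cap, like the count check

        # follow the "Next" link; its absence ends the pagination
        try:
            a = driver.find_element(by=By.LINK_TEXT, value="Next")
            link = a.get_attribute('href')
            if not link:
                raise NoSuchElementException
        except NoSuchElementException:
            has_next_page = False
```

Keeping the fetch at the loop head leaves exactly one place where a listing page is downloaded and saved, which is what lets the diff delete the duplicated driver.get/savePage calls from the old pagination branch.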