- __author__ = '91Shadows'
-
- '''
- DarkFox marketplace Crawler
- '''
-
- import codecs
- import socks, socket, time
- from datetime import date
- import urllib.parse as urlparse
- import http.client as httplib
- import mechanize
- import os
- import subprocess
- from bs4 import BeautifulSoup
- from MarketPlaces.Initialization.prepare_parser import new_parse
- from MarketPlaces.DarkFox.parser import darkfox_links_parser
-
- counter = 1
- httplib.HTTPConnection._http_vsn = 10
- httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
- baseURL = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
- socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)
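- # Note: the settings above force HTTP/1.0 for requests made through http.client and
- # send all SOCKS traffic to 127.0.0.1:9150, the default SOCKS port of the Tor Browser bundle.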
-
-
- # Opens Tor Browser, crawls the mkt
- def startCrawling():
-
- opentor()
- getUrl()
- url = getFixedURL()
- mktName = getMKTName()
- credentials = getCredentials()
- br = getAccess(url, credentials)
-
- if br != 'down':
- crawlMkt(url, br)
- #new_parse(mktName, False)
-
- closetor()
-
-
- # Opens Tor Browser
- def opentor():
-     global pid
-     print("Connecting Tor...")
-     # ../../path.txt is expected to hold the command that launches the Tor Browser executable
-     with open('../../path.txt') as f:
-         path = f.readline().strip()
-     pro = subprocess.Popen(path)
-     pid = pro.pid
-     time.sleep(5)
-     input("Tor Connected. Press ENTER to continue\n")
-     return
-
-
- # Creates a connection through Tor Port
- def getUrl(timeout=None):
- socket.socket = socks.socksocket
- socket.create_connection = create_connection
- return
-
-
- # Makes the onion address request
- def create_connection(address, timeout=None, source_address=None):
- sock = socks.socksocket()
- sock.connect(address)
- return sock
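-
- # Note: getUrl() swaps socks.socksocket and create_connection in for the standard library
- # versions, so connections opened by mechanize/http.client are routed through the Tor SOCKS
- # proxy and .onion hostnames can be resolved by Tor rather than by the local resolver.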
-
-
- # Returns the name of the marketplace (DarkFox)
- def getMKTName():
- name = 'DarkFox'
- return name
-
-
- # Returns credentials needed for the mkt
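- # The string is space-separated: login field name, username, password field name,
- # password, captcha field name, and login form index (consumed in that order by getAccess)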
- def getCredentials():
- credentials = 'blank blank blank blank cap 0'
- return credentials
-
-
- # Returns the onion URL of the marketplace (DarkFox)
- def getFixedURL():
- url = 'http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/'
- return url
-
-
- # Closes Tor Browser
- def closetor():
- global pid
-     print('Closing Tor...')
-     os.system("taskkill /pid " + str(pid))
- time.sleep(3)
- return
-
-
- # Creates a Mechanize browser and initializes its options
- def createBrowser():
- br = mechanize.Browser()
- cj = mechanize.CookieJar()
- br.set_cookiejar(cj)
-
- # Browser options
-     br.set_handle_equiv(True)
-     br.set_handle_redirect(True)
-     br.set_handle_referer(True)
-     br.set_handle_robots(False)
-     br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
-
- br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
- ('Accept', '*/*')]
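-     # The User-agent above mimics Internet Explorer 11 on Windows 7 (NT 6.1)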
-
- return br
-
-
- # Logs into the marketplace: loads the login page, saves the captcha image, and submits the form
- def getAccess(loginPage, credentials):
-
- logInName = credentials.split()[0]
- userName = credentials.split()[1]
- logInPass = credentials.split()[2]
- password = credentials.split()[3]
- captchaName = credentials.split()[4]
- formId = credentials.split()[5]
-
- br = createBrowser()
-
- try:
- keepTrying = True
- while (keepTrying):
-
- br.open(loginPage)
- time.sleep(7)
- html = br.response()
-             soup = BeautifulSoup(html, "html.parser")
- image_tags = soup.findAll('div', {"class": "imgWrap"})
- captchaLink = image_tags[0]
- imagelink = captchaLink['style'].split('url(')[1][:-1]
- data = br.open(imagelink).read()
- br.back()
-             with open('captcha.png', "wb") as f:
-                 f.write(data)
- '''
- subprocess.Popen("python capt.py", shell=False)
- time.sleep(61)
- captchaAnswerFile = open("answer.txt", "r")
- captchaAnswer = captchaAnswerFile.read().__str__()
- '''
-             captchaAnswer = input('Please enter the captcha: ')
- formIndex = int(formId)
- br.select_form(nr=formIndex)
- #br[logInName] = userName
- #br[logInPass] = password
-             br[captchaName] = captchaAnswer
- br.submit()
-             if br.geturl() != baseURL:
- keepTrying = False
-
- return br
-
-     except Exception:
-
- return 'down'
-
-
- # Saves the crawled html page
- def savePage(page, url):
- filePath = getFullPathName(url)
- os.makedirs(os.path.dirname(filePath), exist_ok=True)
-     with open(filePath, "wb") as f:
-         f.write(page.read())
- return
-
-
- # Gets the full path of the page to be saved along with its appropriate file name
- def getFullPathName(url):
- fileName = getNameFromURL(url)
- if isDescriptionLink(url):
- fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
- "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
- "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
- else:
- fullPath = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
- "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
- "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
- return fullPath
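-
-
- # A minimal alternative sketch (hypothetical helper, not called anywhere in this module):
- # the same path can be built with os.path.join and strftime instead of manual concatenation.
- def getFullPathNameJoined(url):
-     baseDir = r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages'
-     subDir = 'Description' if isDescriptionLink(url) else 'Listing'
-     fileName = getNameFromURL(url) + '.html'
-     return os.path.join(baseDir, date.today().strftime("%m%d%Y"), subDir, fileName)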
-
-
- # Creates the name of the file based on URL
- def getNameFromURL(url):
- global counter
- name = ''.join(e for e in url if e.isalnum())
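-     # e.g. a category URL collapses to one long alphanumeric string (scheme, host and
-     # category id run together); URLs with no alphanumeric characters fall back to the counter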
- if (name == ''):
- name = str(counter)
- counter = counter + 1
- return name
-
-
- # Hacking and Markets related topics
- def getInterestedLinks():
- links = []
-
- # Guides and Tutorials
- links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/30739153-1fcd-45cd-b919-072b439c6e06')
- # Digital Products
- links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781')
- # Software and Malware
- links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc')
- # Services
- links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280')
- # Miscellaneous
- links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/fd1c989b-1a74-4dc0-92b0-67d8c1c487cb')
- # Hosting and Security
- links.append('http://57d5j6hfzfpsfev6c7f5ltney5xahudevvttfmw4lrtkt42iqdrkxmqd.onion/category/5233fd6a-72e6-466d-b108-5cc61091cd14')
-
- # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing1.html')
- # links.append('file:///C:/PhD/Projects/DarkWebMining_Sample/MarketPlaces/Crypto/HTML_Pages/02162016/Listing/Listing2.html')
-
- return links
-
-
- def crawlMkt(url, br):
-
- print("Crawling the DarkFox marketplace")
-
- linksToCrawl = getInterestedLinks()
- visited = set(linksToCrawl)
- initialTime = time.time()
-
- i = 0
- while i < len(linksToCrawl):
- link = linksToCrawl[i]
- print('Crawling :', link)
-         try:
- page = br.open(link)
- savePage(page, link)
- for l in br.links():
- absURL = urlparse.urljoin(l.base_url, l.url)
- if absURL not in visited and not isSignOut(absURL) and isListingLink(absURL):
- visited.add(absURL)
-
- #disabling the process of finding other links
- #linksToCrawl.append(absURL)
-
-             # the crawler asks the parser for the links of ALL products on ALL listing pages
-             productList = productPages(link)
-             j = 0
-             for item in productList:
- if j == 2:
- break
- #itemURL = baseURL + str(item)
- try:
- #itemPage = br.open(itemURL)
- itemPage = br.open(item)
- savePage(itemPage, item)
- except:
- #print 'Error in page: ', itemURL
- print('Error in page: ', item)
- j+=1
-
-         except Exception as e:
-             print(link, e)
- i += 1
-
- #finalTime = time.time()
- #print finalTime - initialTime
-
- input("Crawling DarkFox marketplace done successfully. Press ENTER to continue\n")
-
- return
-
-
- # Returns True if the link is a product description page link
- def isDescriptionLink(url):
- if 'product' in url:
- return True
- return False
-
-
- # Returns True if the link is a listingPage link
- def isListingLink(url):
- if 'category' in url:
- return True
- return False
-
-
- # Asks the parser to extract the product links from the saved listing page
- def productPages(url):
-
- soup = ""
-
- error = False
- try:
- html = codecs.open(
- r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
- "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
- "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html', encoding='utf8')
- soup = BeautifulSoup(html, "html.parser")
- except:
- try:
- html = open(
- r'C:\Users\calsyslab\Documents\CALSysLab\threatIntelligence-main\DarkWebMining_Sample\MarketPlaces\DarkFox\HTML_Pages\\' + str(
- "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
- "%04d" % date.today().year) + r'\Listing\\' + getNameFromURL(url) + '.html')
- soup = BeautifulSoup(html, "html.parser")
- except:
- error = True
-             print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")
-
- if error:
- return []
- else:
- return darkfox_links_parser(soup)
-
-
- # Drop links that sign out ("signout"/"logout")
- def isSignOut(url):
- #absURL = urlparse.urljoin(url.base_url, url.url)
- if 'signout' in url.lower() or 'logout' in url.lower():
- return True
-
- return False
-
-
- def crawler():
- startCrawling()
-     #print("Crawling and Parsing DarkFox .... DONE!")
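-
-
- # A minimal, assumed entry point (not part of the original module): allows the module to be
- # run directly as a script; the project may call crawler() from elsewhere instead.
- if __name__ == '__main__':
-     crawler()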