__author__ = '91Shadows'

'''
BestCardingWorld Crawler (Mechanize)
'''

import codecs, os, re
import socks, socket, time
from datetime import date

import urllib.parse as urlparse
import http.client as httplib
import mechanize
import subprocess
from bs4 import BeautifulSoup

from Forums.Initialization.prepare_parser import new_parse
from Forums.BestCardingWorld.parser import bestcardingworld_links_parser

counter = 1

# Force HTTP/1.0 so mechanize does not try to keep connections alive,
# which is unreliable when tunneling through Tor
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

baseURL = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'

# Route all traffic through the Tor SOCKS5 proxy (Tor Browser listens on port 9150)
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9150)


# Opens Tor Browser and crawls the website
def startCrawling():
    opentor()
    getUrl()
    forumName = getForumName()
    br = getAccess()

    if br != 'down':
        crawlForum(br)
        new_parse(forumName, False)

    closetor()


# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    # path.txt is expected to hold the path to the Tor Browser executable on its
    # first line; strip the trailing newline before handing it to Popen
    path = open('../../path.txt').readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(7.5)
    input("Tor Connected. Press ENTER to continue\n")
    return


# Creates a connection through the Tor port
def getUrl(timeout=None):
    # monkey-patch the socket module so every new connection goes through SOCKS
    socket.socket = socks.socksocket
    socket.create_connection = create_connection
    return


# Makes the onion address request
def create_connection(address, timeout=None, source_address=None):
    sock = socks.socksocket()
    sock.connect(address)
    return sock


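# NOTE (added): create_connection mirrors the signature of socket.create_connection so
# it can be patched in above, but the timeout and source_address arguments are
# currently ignored here.

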
# Returns the name of the website
def getForumName():
    name = 'BestCardingWorld'
    return name


# Returns the link of the website
def getFixedURL():
    url = 'http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=42&sid=ee2cbfd73c12923d979790b2bb4bdfd5'
    return url


# Closes Tor Browser
def closetor():
    global pid
    print('Closing Tor...')
    os.system("taskkill /pid " + str(pid))
    time.sleep(3)
    return


# Creates a Mechanize browser and initializes its options
def createBrowser():
    br = mechanize.Browser()
    cj = mechanize.CookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'),
                     ('Accept', '*/*')]

    return br


# Opens the forum URL and returns the browser, or the sentinel 'down' on failure
def getAccess():
    url = getFixedURL()
    br = createBrowser()

    try:
        br.open(url)
        return br
    except Exception:
        return 'down'


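# NOTE (added): failure is signalled with the string sentinel 'down', which is what
# startCrawling checks against before crawling; a None return would be more
# conventional but would require updating that check.

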
# Saves the crawled html page
def savePage(page, url):
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    # use a context manager so the file handle is closed after writing
    with open(filePath, "wb") as f:
        f.write(page.read())
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    # pages are stored under a MMDDYYYY folder, split into Description and Listing pages
    datedDir = 'C:/Users/CALSysLab/Documents/threatIntelligence-main/DarkWebMining_Working/Forums/BestCardingWorld/HTML_Pages/' \
               + date.today().strftime('%m%d%Y')
    if isDescriptionLink(url):
        fullPath = datedDir + '/Description/' + fileName + '.html'
    else:
        fullPath = datedDir + '/Listing/' + fileName + '.html'
    return fullPath


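# For example (hypothetical URL), a topic page crawled on July 4th, 2023 would land in
# .../HTML_Pages/07042023/Description/<alphanumeric-url>.html, while listing pages go
# under .../HTML_Pages/07042023/Listing/.

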
# Creates the name of the file based on the URL
def getNameFromURL(url):
    global counter
    # keep only the alphanumeric characters of the URL; fall back to a running counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter += 1
    return name


# Hacking- and market-related topics
def getInterestedLinks():
    links = []

    links.append('http://bestteermb42clir6ux7xm76d4jjodh3fpahjqgbddbmfrgp4skg2wqd.onion/viewforum.php?f=43&sid=e12864ffccc5df877b03b573534955be')

    return links


# Start crawling forum pages
def crawlForum(br):
    print("Crawling The Best Carding World forum")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            page = br.open(link)
            savePage(page, link)

            # br.response() returns a copy of the current response, so the body should
            # be readable here even though savePage already consumed `page`
            res = br.response().read()
            soup = BeautifulSoup(res, 'html.parser')

            # queue the next listing page, if any
            next_link = soup.find("a", {"rel": "next"})
            if next_link is not None:
                full_url = urlparse.urljoin(linksToCrawl[i], next_link['href'])
                linksToCrawl.insert(i + 1, full_url)

            # save every topic (description) page found on this listing page
            listOfTopics = findDescriptionPages(link)
            for topic in listOfTopics:
                itemPage = br.open(str(topic))
                savePage(itemPage, topic)

        except Exception as e:
            print('Error getting link: ', link, e)
        i += 1

    # finalTime = time.time()
    # print(finalTime - initialTime)

    input("Crawling The Best Carding World forum done successfully. Press ENTER to continue\n")

    return


# Returns True if the link is a 'topic' (description) link
def isDescriptionLink(url):
    if 'topic' in url:
        return True
    return False


# Returns True if the link is a listing page link
def isListingLink(url):
    '''
    reg = 'board=[0-9]+.[0-9]+\Z'
    if len(re.findall(reg, url)) == 0:
        return False
    return True
    '''
    if 'forum' in url:
        return True
    return False


# Calls the parser to extract the description (topic) links from a saved listing page
def findDescriptionPages(url):
    soup = ""

    # the listing page was saved to disk by savePage; re-open it for parsing
    listingPath = "C:\\Users\\CALSysLab\\Documents\\threatIntelligence-main\\DarkWebMining_Working\\Forums\\BestCardingWorld\\HTML_Pages\\" \
                  + date.today().strftime('%m%d%Y') + "\\Listing\\" + getNameFromURL(url) + ".html"

    error = False
    try:
        html = codecs.open(listingPath, encoding='utf8')
        soup = BeautifulSoup(html, "html.parser")
    except Exception:
        try:
            html = open(listingPath)
            soup = BeautifulSoup(html, "html.parser")
        except Exception:
            error = True
            print("There was a problem reading the file " + getNameFromURL(url) + " in the listing section.")

    if not error:
        return bestcardingworld_links_parser(soup)
    else:
        return []


def crawler():
    startCrawling()
    print("Crawling and Parsing The Best Carding World .... DONE!")
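

# Standalone entry point (added; assumes the module is normally driven by an external
# script that imports crawler(), so this block only runs on direct execution)
if __name__ == '__main__':
    crawler()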