@@ -0,0 +1,281 @@
__author__ = 'DarkWeb'

'''
Nulled Forum Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from PIL import Image
import base64
from io import BytesIO
import urllib.parse as urlparse
import os, re, time
from datetime import date
import subprocess
from bs4 import BeautifulSoup
from Forums.Initialization.prepare_parser import new_parse
from Forums.Nulled.parser import nulled_links_parser
from Forums.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'https://www.nulled.to'
# Opens Tor Browser, crawls the website
def startCrawling():
    # opentor()
    forumName = getForumName()
    # driver = getAccess()
    #
    # if driver != 'down':
    #     login(driver)
    #     crawlForum(driver)
    #     closetor(driver)

    new_parse(forumName, False)


# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    path = open('../../path.txt').readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(7.5)
    input('Tor Connected. Press ENTER to continue\n')
    return
# Login using premade account credentials and do login captcha manually
def login(driver):
    time.sleep(3)
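    # NOTE: login is currently a manual/no-op step for Nulled; in startCrawling()
    # above the Selenium calls (getAccess/login/crawlForum/closetor) are commented
    # out, so a run only invokes new_parse() on pages saved earlier.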
# Returns the name of the website
def getForumName():
    name = 'Nulled'
    return name


# Return the link of the website
def getFixedURL():
    url = 'https://www.nulled.to'
    return url


# Closes Tor Browser
def closetor(driver):
    global pid
    # os.system("taskkill /pid " + str(pro.pid))
    os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()
    time.sleep(3)
    return
# Creates FireFox 'driver' and configures its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
    with open('../../path.txt', 'r') as file:
        lines = file.readlines()

    ff_binary = FirefoxBinary(lines[0].strip())

    ff_prof = FirefoxProfile(lines[1].strip())
    # ff_prof.set_preference("places.history.enabled", False)
    # ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    # ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    # ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    # ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    # ff_prof.set_preference("signon.rememberSignons", False)
    # ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    # ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
    # ff_prof.set_preference("permissions.default.image", 3)
    # ff_prof.set_preference("browser.download.folderList", 2)
    # ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    # ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    # Route all traffic through the SOCKS proxy exposed by the Tor Browser bundle (port 9150)
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", True)
    ff_prof.update_preferences()

    service = Service(lines[2].strip())

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

    return driver
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        # time.sleep(3)
        return driver
    except:
        return 'down'


# Saves the crawled html page
def savePage(page, url):
    cleanPage = cleanHTML(page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as file:
        file.write(cleanPage.encode('utf-8'))
    return
# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = r'..\Nulled\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
    else:
        fullPath = r'..\Nulled\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
    return fullPath
# Creates the file name from passed URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
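# Example (illustrative): getNameFromURL('https://www.nulled.to/topic/123') returns
# 'httpswwwnulledtotopic123'; a URL with no alphanumeric characters falls back to the
# running counter instead.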
def getInterestedLinks():
    links = []

    # Cracking Tools
    links.append('https://www.nulled.to/forum/90-cracking-tools/')
    # # Cracking Tutorials
    # links.append('https://www.nulled.to/forum/98-cracking-tutorials/')
    # # Releases
    # links.append('https://www.nulled.to/forum/209-releases/')
    # # .Net Framework
    # links.append('https://www.nulled.to/forum/51-net-framework/')
    # # HTML, CSS, JS, PHP
    # links.append('https://www.nulled.to/forum/54-html-css-js-php/')
    # # C / C++
    # links.append('https://www.nulled.to/forum/52-cc/')
    # # Other languages
    # links.append('https://www.nulled.to/forum/135-other-languages/')

    return links
def crawlForum(driver):
    print("Crawling the Nulled forum")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    count = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            try:
                driver.get(link)
            except:
                driver.refresh()
            html = driver.page_source
            savePage(html, link)

            has_next_page = True
            while has_next_page:
                topicList = topicPages(html)
                for item in topicList:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver.page_source, item)
                    driver.back()
                    # only the first topic on this listing page is visited
                    break

                # stop after two listing pages for this link
                if count == 1:
                    count = 0
                    break

                try:
                    temp = driver.find_element(by=By.XPATH, value='/html/body/div[4]/div[3]/div/div[3]/div[4]')
                    temp = temp.find_element(by=By.CLASS_NAME, value='pagination')
                    link = temp.find_element(by=By.CLASS_NAME, value='next')
                    link = link.find_element(by=By.TAG_NAME, value='a').get_attribute('href')

                    if link == "":
                        raise NoSuchElementException
                    try:
                        driver.get(link)
                    except:
                        driver.refresh()
                    html = driver.page_source
                    savePage(html, link)
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    # finalTime = time.time()
    # print finalTime - initialTime

    input("Crawling Nulled forum done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is a Topic link
def isDescriptionLink(url):
    if 'topic/' in url:
        return True
    return False


# Returns True if the link is a listingPage link
def isListingLink(url):
    if 'forum/' in url:
        return True
    return False


# Calls the parser to extract the topic links
def topicPages(html):
    soup = BeautifulSoup(html, "html.parser")
    # print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr', {"class": "inline_row"}).find('strong').text)
    return nulled_links_parser(soup)


def crawler():
    startCrawling()
    # print("Crawling and Parsing Nulled .... DONE!")
@@ -0,0 +1,355 @@
__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from Forums.Utilities.utilities import *
from datetime import date
from datetime import datetime  # needed for the strptime calls below
from datetime import timedelta
import re

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
def nulled_description_parser(soup):

    # Fields to be parsed

    topic = "-1"     # topic name
    user = []        # all users of each post
    addDate = []     # all dates of each post
    feedback = []    # all feedbacks of each vendor (this was found in just one forum and with a number format)
    status = []      # all users' authority in each post such as (adm, member, dangerous)
    reputation = []  # all users' karma in each post (usually found as a number)
    sign = []        # all users' signatures in each post (usually a standard message after the content of the post)
    post = []        # all messages of each post
    interest = []    # all users' interests in each post

    # Finding the topic (should be just one coming from the Listing Page)

    li = soup.find("td", {"class": "thead"}).find('strong')
    topic = li.text
    topic = re.sub(r"\[\w*\]", '', topic)
    topic = topic.replace(",", "")
    topic = topic.replace("\n", "")
    topic = cleanString(topic.strip())
    print(topic)
    # Finding the repeated tag that corresponds to the listing of posts

    # posts = soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg"}) + \
    #         soup.find("form", {"name": "quickModForm"}).findAll('div', {"class": "windowbg2"})

    try:
        posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
            'div', {"class": "post"})
        # print(len(posts))

        # For each message (post), get all the fields we are interested in:
        for ipost in posts:

            # Finding the first level of the HTML page

            # post_wrapper = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "poster"})
            post_wrapper = ipost.find('span', {"class": "largetext"})

            # Finding the author (user) of the post

            # author = post_wrapper.find('h4')
            author = post_wrapper.text.strip()
            # print("author " + author)
            user.append(cleanString(author))  # Remember to clean the problematic characters

            # Finding the status of the author

            smalltext = ipost.find('div', {"class": "post_author"})

            # Testing here two possibilities to find this status and combine them
            if ipost.find('div', {"class": "deleted_post_author"}):
                status.append(-1)
                interest.append(-1)
                reputation.append(-1)
                addDate.append(-1)
                post.append("THIS POST HAS BEEN REMOVED!")
                sign.append(-1)
                feedback.append(-1)
                continue

            # Nulled does have membergroup and postgroup
            membergroup = smalltext.find('div', {"class": "profile-rank"})
            postgroup = smalltext.find('div', {"class": "postgroup"})
            if membergroup != None:
                membergroup = membergroup.text.strip()
                if postgroup != None:
                    postgroup = postgroup.text.strip()
                    membergroup = membergroup + " - " + postgroup
            else:
                if postgroup != None:
                    membergroup = postgroup.text.strip()
                else:
                    membergroup = "-1"
            status.append(cleanString(membergroup))
            # print("status " + cleanString(membergroup))

            # Finding the interest of the author
            # CryptBB does not have blurb
            blurb = smalltext.find('li', {"class": "blurb"})
            if blurb != None:
                blurb = blurb.text.strip()
            else:
                blurb = "-1"
            interest.append(cleanString(blurb))

            # Finding the reputation of the user
            # CryptBB does have reputation
            author_stats = smalltext.find('div', {"class": "author_statistics"})
            karma = author_stats.find('strong')
            if karma != None:
                karma = karma.text
                karma = karma.replace("Community Rating: ", "")
                karma = karma.replace("Karma: ", "")
                karma = karma.strip()
            else:
                karma = "-1"
            reputation.append(cleanString(karma))
            # print("karma " + cleanString(karma))
            # Getting here another good tag to find the post date, post content and user's signature

            postarea = ipost.find('div', {"class": "post_content"})

            dt = postarea.find('span', {"class": "post_date"}).text
            # dt = dt.strip().split()
            dt = dt.strip()
            day = date.today()
            if "Yesterday" in dt:
                # relative timestamp: rebuild an absolute date from yesterday's date plus the time of day
                yesterday = day - timedelta(days=1)
                yesterday = yesterday.strftime('%m-%d-%Y')
                stime = dt.replace('Yesterday,', '').strip()
                date_time_obj = yesterday + ', ' + stime
                date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
            elif "hours ago" in dt:
                # relative timestamp: the absolute date/time is carried by the span's title attribute
                day = day.strftime('%m-%d-%Y')
                date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
                date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
            else:
                date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
            sdate = date_time_obj.strftime('%b %d, %Y')
            stime = date_time_obj.strftime('%I:%M %p')
            addDate.append(date_time_obj)
            # print("date " + str(date_time_obj))
            # Finding the date of the post
            # date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p')

            # smalltext = postarea.find('div', {"class": "flow_hidden"}).find('div', {"class": "keyinfo"})\
            #     .find('div', {"class": "smalltext"})
            # sdatetime = smalltext.text
            # sdatetime = sdatetime.replace(u"\xab", "")  # Removing unnecessary characters
            # sdatetime = sdatetime.replace(u"\xbb", "")  # Removing unnecessary characters
            # sdatetime = sdatetime.split("on: ")         # Removing unnecessary characters
            # sdatetime = sdatetime[1].strip()
            # stime = sdatetime[:-12:-1]                  # Finding the time of the post
            # stime = stime[::-1]
            # sdate = sdatetime.replace(stime, "")        # Finding the date of the post
            # sdate = sdate.replace(",", "")
            # sdate = sdate.strip()

            # Convert the date of the post that can be informed as: "12 February 2016", "today", "yesterday". We need
            # a date format here as "mm/dd/yyyy"
            # addDate.append(convertDate(sdate, "english", crawlerDate) + " " + stime)
            # Finding the post

            inner = postarea.find('div', {"class": "post_body scaleimages"})
            inner = inner.text.strip()
            # print(inner)
            post.append(cleanString(inner))

            # Finding the user's signature

            # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
            signature = ipost.find('div', {"class": "signature scaleimages"})
            if signature != None:
                signature = signature.text.strip()
                # print(signature)
            else:
                signature = "-1"
            sign.append(cleanString(signature))

            # As no information about the user's feedback was found, just assign "-1" to the variable
            feedback.append("-1")

    except:
        if soup.find('td', {"class": "trow1"}).text == " You do not have permission to access this page. ":
            user.append("-1")
            status.append(-1)
            interest.append(-1)
            reputation.append(-1)
            addDate.append(-1)
            post.append("NO ACCESS TO THIS PAGE!")
            sign.append(-1)
            feedback.append(-1)

    # Populate the final variable (this should be a list with all fields scraped)

    row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)

    # Sending the results

    return row
# This is the method to parse the Listing Pages (one page with many posts)
def nulled_listing_parser(soup):

    board = "-1"    # board name (the previous level of the topic in the Forum categorization tree.
                    # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)

    nm = 0          # this variable should receive the number of topics
    topic = []      # all topics
    user = []       # all users of each topic
    post = []       # number of posts of each topic
    view = []       # number of views of each topic
    addDate = []    # when the topic was created (difficult to find)
    href = []       # this variable should receive all cleaned urls (we will use this to do the merge between
                    # Listing and Description pages)

    # Finding the board (should be just one)

    board = soup.find('span', {"class": "active"}).text
    board = cleanString(board.strip())

    # Finding the repeated tag that corresponds to the listing of topics

    itopics = soup.find_all('tr', {"class": "inline_row"})
    index = 0
    for itopic in itopics:

        # For each topic found, the structure to get the rest of the information can be of two types. Testing both of
        # them so we don't miss any topic

        # Adding the topic to the topic list
        try:
            topics = itopic.find('span', {"class": "subject_old"}).find('a').text
        except:
            topics = itopic.find('span', {"class": "subject_new"}).find('a').text
        topics = re.sub(r"\[\w*\]", '', topics)
        topic.append(cleanString(topics))

        # Counting how many topics we have found so far
        nm = len(topic)

        # Adding the url to the list of urls
        try:
            link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
        except:
            link = itopic.find('span', {"class": "subject_new"}).find('a').get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding the author of the topic
        ps = itopic.find('div', {"class": "author smalltext"}).find('a').text
        author = ps.strip()
        user.append(cleanString(author))

        # Finding the number of replies
        columns = itopic.findChildren('td', recursive=False)
        posts = columns[3].text
        post.append(cleanString(posts))

        # Finding the number of Views
        tview = columns[4].text
        view.append(cleanString(tview))

        # If there is no information about when the topic was added, just assign "-1" to the variable
        # dt = itopic.find('div', {"class": "responsive-hide"}).text.split('»')[1]
        # dt = dt.strip()
        # date_time_obj = datetime.strptime(dt, '%a %b %d, %Y %I:%M %p')
        # addDate.append(date_time_obj)
        addDate.append("-1")

        index += 1

    return organizeTopics("Nulled", nm, topic, board, view, post, user, addDate, href)
    # if len(tag) > 0:
    #
    #     # Finding the topic
    #
    #     tds = tds[0].find(tag[0])
    #     topics = tds.text
    #     topics = topics.replace(u"\xbb", "")
    #     topics = topics.strip()
    #     topic.append(cleanString(topics))
    #
    #     # Counting how many topics we have found so far
    #
    #     nm = len(topic)
    #
    #     # Adding the url to the list of urls
    #
    #     link = tds.findAll('a', href=True)
    #     link = link[0].get('href')
    #     link = cleanLink(link)
    #     href.append(link)
    #
    #     # Finding the author of the topic
    #
    #     ps = itopic.find('td', {"class": tag[1]}).find('p').find('a')
    #     if ps == None:
    #         ps = itopic.find('td', {"class": tag[1]}).find('p')
    #         ps = ps.text.replace("Started by ", "")
    #     else:
    #         ps = ps.text
    #     author = ps.strip()
    #     user.append(cleanString(author))
    #
    #     # Finding the number of replies
    #
    #     statistics = itopic.find('td', {"class": tag[2]})
    #     statistics = statistics.text
    #     statistics = statistics.split("Replies")
    #     posts = statistics[0].strip()
    #     post.append(cleanString(posts))
    #
    #     # Finding the number of Views
    #
    #     views = statistics[1]
    #     views = views.replace("Views", "")
    #     views = views.strip()
    #     view.append(cleanString(views))
    #
    #     # As no information about when the topic was added, just assign "-1" to the variable
    #
    #     addDate.append("-1")

    # return organizeTopics("TheMajesticGarden", nm, topic, board, view, post, user, addDate, href)
def nulled_links_parser(soup):

    # Returning all links that should be visited by the Crawler
    href = []
    # print(soup.find('table', {"class": "tborder clear"}).find(
    #     'tbody').find_all('tr', {"class": "inline_row"}))

    listing = soup.find('tbody')
    listing = listing.find_all('tr', id=True)

    for a in listing:
        listing_rows = a.find_all('td')
        link = listing_rows[1]
        link = link.find('a', {'class': 'topic_title'})
        link = link['href']
        href.append(link)

    return href
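
# Illustrative offline check (not part of the original module): the crawler calls this
# parser on each listing page it loads (via topicPages); the same can be done by hand
# against a saved HTML file for debugging. The file name 'listing.html' is hypothetical.
if __name__ == '__main__':
    with open('listing.html', 'r', encoding='utf-8') as f:
        for link in nulled_links_parser(BeautifulSoup(f.read(), "html.parser")):
            print(link)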
@@ -0,0 +1,337 @@
__author__ = 'DarkWeb'

'''
Kerberos Market Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from PIL import Image
import urllib.parse as urlparse
import os, time
from datetime import date
import subprocess
from bs4 import BeautifulSoup
from MarketPlaces.Initialization.prepare_parser import new_parse
from MarketPlaces.Kerberos.parser import kerberos_links_parser
from MarketPlaces.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion'
# Opens Tor Browser, crawls the website
def startCrawling():
    opentor()
    # marketName = getMarketName()
    driver = getAccess()

    if driver != 'down':
        captcha(driver)
        login(driver)
        crawlForum(driver)
        # new_parse(marketName, False)
        closetor(driver)


# Opens Tor Browser
def opentor():
    global pid
    print("Connecting Tor...")
    path = open('../../path.txt').readline().strip()
    pro = subprocess.Popen(path)
    pid = pro.pid
    time.sleep(7.5)
    input('Tor Connected. Press ENTER to continue\n')
    return
def captcha(driver):
    # wait for captcha page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div/img")))

    # too hard to code, requires manual completion

    # wait for login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div[1]/div[2]/div/form/div[10]/button")))
# Login using premade account credentials and do login captcha manually
def login(driver):
    # wait for login page
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div[1]/div[2]/div/form/div[10]/button")))

    # entering username and password into input boxes
    usernameBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/form/input[1]')
    # Username here
    usernameBox.send_keys('blabri')
    passwordBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/form/input[2]')
    # Password here
    passwordBox.send_keys('fishowal')

    # wait for captcha page to show up
    # WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
    #     (By.XPATH, "/html/body/div/img[24]")))
    time.sleep(10)

    # save captcha to local
    driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/form/div[6]').screenshot(
        r'..\Kerberos\captcha.png')

    # This method will show the image in any image viewer
    im = Image.open(r'..\Kerberos\captcha.png')
    im.show()

    # wait until the input space shows up
    inputBox = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/form/input[3]')

    # ask the user to input the captcha solution in the terminal
    userIn = input("Enter solution: ")

    # send the user's solution into the input space
    inputBox.send_keys(userIn)

    # click the verify (submit) button
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.find_element(by=By.XPATH, value="/html/body/div[1]/div[2]/div/form/div[10]/button").click()

    # wait for the listing page to show up (this XPath may need to change based on different seed url)
    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
        (By.XPATH, '//*[@id="breadcrumb"]')))
# Returns the name of the website
def getMarketName():
    name = 'Kerberos'
    return name


# Return the link of the website
def getFixedURL():
    url = 'http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion'
    return url


# Closes Tor Browser
def closetor(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.quit()
    time.sleep(3)
    return
# Creates FireFox 'driver' and configures its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
    with open('../../path.txt', 'r') as file:
        lines = file.readlines()

    ff_binary = FirefoxBinary(lines[0].strip())

    ff_prof = FirefoxProfile(lines[1].strip())
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", False)
    ff_prof.update_preferences()

    service = Service(executable_path=lines[2].strip())

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

    return driver
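
# Note: this profile differs from the Nulled crawler's: the history, cookie and
# shutdown-sanitizing preferences are active rather than commented out, and JavaScript
# is disabled (javascript.enabled = False) instead of enabled.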
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)
        return driver
    except:
        return 'down'


# Saves the crawled html page
def savePage(page, url):
    cleanPage = cleanHTML(page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as file:
        file.write(cleanPage.encode('utf-8'))
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    fileName = getNameFromURL(url)
    if isDescriptionLink(url):
        fullPath = r'..\Kerberos\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
    else:
        fullPath = r'..\Kerberos\HTML_Pages\\' + str(
            "%02d" % date.today().month) + str("%02d" % date.today().day) + str(
            "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
    return fullPath


# Creates the file name from passed URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name
def getInterestedLinks():
    links = []

    # Services - Hacking
    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/99/block/price-none/ww/ww/1/')
    # Tutorials - Hacking
    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/122/block/price-none/ww/ww/1/')
    # Tutorials - Guides
    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/124/block/price-none/ww/ww/1/')
    # Tutorials - Other
    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/126/block/price-none/ww/ww/1/')
    # Software and Malware - Botnets
    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/129/block/price-none/ww/ww/1/')
    # Software and Malware - Malware
    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/130/block/price-none/ww/ww/1/')
    # Software and Malware - Trojans
    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/131/block/price-none/ww/ww/1/')
    # Software and Malware - Exploits / Kits
    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/133/block/price-none/ww/ww/1/')
    # Software and Malware - Other
    links.append('http://kerberosazmnfrjinmftp3im3cr7hw4nxbavm4ngofn64g24be7h3kqd.onion/categories/136/block/price-none/ww/ww/1/')

    return links
def crawlForum(driver):
    print("Crawling the Kerberos market")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    count = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            try:
                driver.get(link)
            except:
                driver.refresh()
            html = driver.page_source
            savePage(html, link)

            has_next_page = True
            while has_next_page:
                productList = productPages(html)
                for item in productList:
                    itemURL = urlparse.urljoin(baseURL, str(item))
                    try:
                        driver.get(itemURL)
                    except:
                        driver.refresh()
                    savePage(driver.page_source, item)
                    driver.back()
                    # break

                # stop after two listing pages for this category
                if count == 1:
                    count = 0
                    break

                try:
                    nav = driver.find_element(by=By.XPATH, value='/html/body/div[3]/div[4]/div[4]/div/div[1]/div[28]')
                    a = nav.find_element(by=By.LINK_TEXT, value="Next")
                    link = a.get_attribute('href')

                    if link == "":
                        raise NoSuchElementException
                    try:
                        driver.get(link)
                    except:
                        driver.refresh()
                    html = driver.page_source
                    savePage(html, link)
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    # finalTime = time.time()
    # print finalTime - initialTime

    input("Crawling Kerberos market done successfully. Press ENTER to continue\n")
# Returns 'True' if the link is a description (item) link
def isDescriptionLink(url):
    if 'item' in url:
        return True
    return False


# Returns True if the link is a listingPage link
def isListingLink(url):
    if 'categories' in url:
        return True
    return False


# Calls the parser to extract the product links
def productPages(html):
    soup = BeautifulSoup(html, "html.parser")
    return kerberos_links_parser(soup)


def crawler():
    startCrawling()
    # print("Crawling and Parsing Kerberos .... DONE!")
@@ -0,0 +1,249 @@
__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
import re  # used below for the CVE / MS regular expressions

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page for each Product in the Listing Pages)
def kerberos_description_parser(soup):

    # Fields to be parsed

    name = "-1"          # 0 Product_Name y
    describe = "-1"      # 1 Product_Description y
    lastSeen = "-1"      # 2 Product_LastViewDate
    rules = "-1"         # 3 NOT USED ...
    CVE = "-1"           # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"            # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"        # 6 Product_Number_Of_Reviews
    category = "-1"      # 7 Product_Category
    shipFrom = "-1"      # 8 Product_ShippedFrom
    shipTo = "-1"        # 9 Product_ShippedTo
    left = "-1"          # 10 Product_QuantityLeft y
    escrow = "-1"        # 11 Vendor_Warranty y
    terms = "-1"         # 12 Vendor_TermsAndConditions
    vendor = "-1"        # 13 Vendor_Name y
    sold = "-1"          # 14 Product_QuantitySold y
    addDate = "-1"       # 15 Product_AddedDate
    available = "-1"     # 16 NOT USED ...
    endDate = "-1"       # 17 NOT USED ...
    BTC = "-1"           # 18 Product_BTC_SellingPrice y
    USD = "-1"           # 19 Product_USD_SellingPrice y
    rating = "-1"        # 20 Vendor_Rating
    success = "-1"       # 21 Vendor_Successful_Transactions
    EURO = "-1"          # 22 Product_EURO_SellingPrice

    bae = soup.find('div', {'class': "col-9"})
    # Finding Product Name

    name = bae.find('h2').text
    name = name.replace('\n', ' ')
    name = name.replace(",", "")
    name = name.strip()

    mb = bae.findAll('div', {"class": "mb-1"})

    # Finding Vendor

    vendor = mb[0].text
    vendor = vendor.replace(",", "")
    vendor = vendor.replace("Sold by:", "")
    vendor = vendor.strip()

    # # Finding Vendor Rating
    # full_stars = bae[2].find_all('i', {'class': "fas fa-star"})
    # half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"})
    # rating = len(full_stars) + (0.5 if half_star is not None else 0)

    # Finding Warranty
    escrow = mb[2].text
    escrow = escrow.replace("Payment:", "")
    escrow = escrow.strip()

    # Finding Quantity Sold and Left
    temp = mb[4].text.split(',')

    sold = temp[0].replace("sold", "")
    sold = sold.strip()

    left = temp[1].replace("in stock", "")
    left = left.strip()

    # Finding USD
    USD = bae.find('div', {"class": "h3 text-secondary"}).text
    USD = USD.replace("$", "")
    USD = USD.strip()

    # Finding BTC
    temp = bae.find('div', {"class": "small"}).text.split("BTC")

    BTC = temp[0].strip()

    # shipping_info = bae[4].text
    # if "Digital" not in shipping_info:
    #     shipping_info = shipping_info.split(" ")
    #
    #     # Finding Shipment Information (Origin)
    #     shipFrom = shipping_info[0].strip()
    #
    #     # Finding Shipment Information (Destination)
    #     shipTo = shipping_info[1].strip()

    # Finding the Product description
    describe = bae.find('div', {"class": "card border-top-0"}).text
    describe = describe.replace("\n", " ")
    describe = describe.replace("\r", " ")
    describe = describe.strip()

    # Searching for CVE and MS categories
    # Note: findAll(text=re.compile(...)) returns the whole text nodes that contain a
    # match, so CVE/MS end up holding the matching text fragments concatenated together.
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " "
        for idx in cve:
            CVE += (idx)
            CVE += " "
            CVE = CVE.replace(',', ' ')
            CVE = CVE.replace('\n', '')
    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
    if ms:
        MS = " "
        for im in ms:
            MS += (im)
            MS += " "
            MS = MS.replace(',', ' ')
            MS = MS.replace('\n', '')

    # Populating the final variable (this should be a list with all fields scraped)

    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)

    # Sending the results

    return row
# This is the method to parse the Listing Pages
def kerberos_listing_parser(soup):

    # Fields to be parsed
    nm = 0                   # Total_Products (Should be Integer)
    mktName = "Kerberos"     # 0 Marketplace_Name
    name = []                # 1 Product_Name y
    CVE = []                 # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                  # 3 Product_MS_Classification (Microsoft Security)
    category = []            # 4 Product_Category y
    describe = []            # 5 Product_Description
    escrow = []              # 6 Vendor_Warranty
    views = []               # 7 Product_Number_Of_Views
    reviews = []             # 8 Product_Number_Of_Reviews y
    addDate = []             # 9 Product_AddDate
    lastSeen = []            # 10 Product_LastViewDate
    BTC = []                 # 11 Product_BTC_SellingPrice
    USD = []                 # 12 Product_USD_SellingPrice y
    EURO = []                # 13 Product_EURO_SellingPrice
    sold = []                # 14 Product_QuantitySold
    qLeft = []               # 15 Product_QuantityLeft
    shipFrom = []            # 16 Product_ShippedFrom
    shipTo = []              # 17 Product_ShippedTo
    vendor = []              # 18 Vendor y
    rating = []              # 19 Vendor_Rating
    success = []             # 20 Vendor_Successful_Transactions
    href = []                # 24 Product_Links (Urls)

    listing = soup.findAll('div', {"class": "card product-card mb-3"})

    # Populating the Number of Products
    nm = len(listing)

    # Finding Category
    cat = soup.find("div", {"class": "col-9"})
    cat = cat.find("h2").text
    cat = cat.replace("Category: ", "")
    cat = cat.replace(",", "")
    cat = cat.strip()
    for card in listing:
        category.append(cat)

        bae = card.findAll('a')

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding Product Name
        product = bae[1].text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.strip()
        name.append(product)

        # Finding Vendor
        vendor_name = bae[2].text
        vendor_name = vendor_name.replace(",", "")
        vendor_name = vendor_name.strip()
        vendor.append(vendor_name)

        # Finding USD
        usd = card.find('div', {"class": "mb-1"}).text
        usd = usd.replace("$", "")
        usd = usd.strip()
        USD.append(usd)

        # Finding Reviews
        num = card.find("span", {"class": "rate-count"}).text
        num = num.replace("(", "")
        num = num.replace("review)", "")
        num = num.replace("reviews)", "")
        num = num.strip()
        reviews.append(num)

        # Searching for CVE and MS categories
        cve = card.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            cee = " "
            for idx in cve:
                cee += (idx)
                cee += " "
                cee = cee.replace(',', ' ')
                cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = card.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " "
            for im in ms:
                me += (im)
                me += " "
                me = me.replace(',', ' ')
                me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
def kerberos_links_parser(soup):

    # Returning all links that should be visited by the Crawler
    href = []

    content = soup.find('div', {"id": "content-pos"})
    listing = content.findAll('div', {"class": "item-block"})

    for div in listing:
        # the 'ae zx300' div wraps the product anchors; the second one carries the product URL
        ae = div.find('div', {"class": "ae zx300"})
        links = ae.findAll('a')
        href.append(links[1]['href'])

    return href