this is based on calsyslab project
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

249 lines
9.0 KiB

__author__ = 'DarkWeb'
import re

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
def kerberos_description_parser(soup):
    """Parse one Kerberos marketplace product-description page.

    :param soup: BeautifulSoup tree of the description page HTML
    :return: 23-tuple of scraped string fields, in the fixed order listed
             below; "-1" marks a field that was not found on the page
             (or is not used by this market)
    """

    # Fields to be parsed ("y" marks fields actually scraped below)
    name = "-1"       # 0 Product_Name y
    describe = "-1"   # 1 Product_Description y
    lastSeen = "-1"   # 2 Product_LastViewDate
    rules = "-1"      # 3 NOT USED ...
    CVE = "-1"        # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"         # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"     # 6 Product_Number_Of_Reviews
    category = "-1"   # 7 Product_Category
    shipFrom = "-1"   # 8 Product_ShippedFrom
    shipTo = "-1"     # 9 Product_ShippedTo
    left = "-1"       # 10 Product_QuantityLeft y
    escrow = "-1"     # 11 Vendor_Warranty y
    terms = "-1"      # 12 Vendor_TermsAndConditions
    vendor = "-1"     # 13 Vendor_Name y
    sold = "-1"       # 14 Product_QuantitySold y
    addDate = "-1"    # 15 Product_AddedDate
    available = "-1"  # 16 NOT USED ...
    endDate = "-1"    # 17 NOT USED ...
    BTC = "-1"        # 18 Product_BTC_SellingPrice y
    USD = "-1"        # 19 Product_USD_SellingPrice y
    rating = "-1"     # 20 Vendor_Rating
    success = "-1"    # 21 Vendor_Successful_Transactions
    EURO = "-1"       # 22 Product_EURO_SellingPrice

    # Main content column of the page; all product fields live under it
    bae = soup.find('div', {'class': "col-9"})

    # Finding Product Name
    name = bae.find('h2').text
    name = name.replace('\n', ' ')
    name = name.replace(",", "")
    name = name.strip()

    mb = bae.findAll('div', {"class": "mb-1"})

    # Finding Vendor
    vendor = mb[0].text
    vendor = vendor.replace(",", "")
    vendor = vendor.replace("Sold by:", "")
    vendor = vendor.strip()

    # # Finding Vendor Rating (not currently scraped)
    # full_stars = bae[2].find_all('i', {'class': "fas fa-star"})
    # half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"})
    # rating = len(full_stars) + (0.5 if half_star is not None else 0)

    # Finding Warranty
    escrow = mb[2].text
    escrow = escrow.replace("Payment:", "")
    escrow = escrow.strip()

    # Finding Quantity Sold and Left (single "N sold, M in stock" element)
    temp = mb[4].text.split(',')
    sold = temp[0].replace("sold", "").strip()
    left = temp[1].replace("in stock", "").strip()

    # Finding USD
    USD = bae.find('div', {"class": "h3 text-secondary"}).text
    USD = USD.replace("$", "").strip()

    # Finding BTC (text is "<amount> BTC ..."; keep the part before "BTC")
    temp = bae.find('div', {"class": "small"}).text.split("BTC")
    BTC = temp[0].strip()

    # shipping_info = bae[4].text (not currently scraped)
    # if "Digital" not in shipping_info:
    #     shipping_info = shipping_info.split(" ")
    #     shipFrom = shipping_info[0].strip()   # Shipment origin
    #     shipTo = shipping_info[1].strip()     # Shipment destination

    # Finding the Product description
    describe = bae.find('div', {"class": "card border-top-0"}).text
    describe = describe.replace("\n", " ")
    describe = describe.replace("\r", " ")
    describe = describe.strip()

    # Searching for CVE and MS categories anywhere on the page.
    # Raw strings are required: '\d' in a plain string is an invalid
    # escape sequence (SyntaxWarning on Python >= 3.12).
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    if cve:
        # Space-delimit all matches (same format the concat loop produced)
        CVE = " " + " ".join(cve) + " "
        CVE = CVE.replace(',', ' ')
        CVE = CVE.replace('\n', '')

    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
    if ms:
        MS = " " + " ".join(ms) + " "
        MS = MS.replace(',', ' ')
        MS = MS.replace('\n', '')

    # Populating the final variable (this should be a list with all fields scraped)
    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)

    # Sending the results
    return row
# This is the method to parse the Listing Pages
def kerberos_listing_parser(soup):
    """Parse one Kerberos marketplace category-listing page.

    :param soup: BeautifulSoup tree of the listing page HTML
    :return: whatever ``organizeProducts`` builds from the per-product
             field lists scraped below (one entry per product card)
    """

    # Fields to be parsed ("y" marks fields actually scraped below)
    nm = 0                    # Total_Products (Should be Integer)
    mktName = "Kerberos"      # 0 Marketplace_Name
    name = []                 # 1 Product_Name y
    CVE = []                  # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                   # 3 Product_MS_Classification (Microsoft Security)
    category = []             # 4 Product_Category y
    describe = []             # 5 Product_Description
    escrow = []               # 6 Vendor_Warranty
    views = []                # 7 Product_Number_Of_Views
    reviews = []              # 8 Product_Number_Of_Reviews y
    addDate = []              # 9 Product_AddDate
    lastSeen = []             # 10 Product_LastViewDate
    BTC = []                  # 11 Product_BTC_SellingPrice
    USD = []                  # 12 Product_USD_SellingPrice y
    EURO = []                 # 13 Product_EURO_SellingPrice
    sold = []                 # 14 Product_QuantitySold
    qLeft = []                # 15 Product_QuantityLeft
    shipFrom = []             # 16 Product_ShippedFrom
    shipTo = []               # 17 Product_ShippedTo
    vendor = []               # 18 Vendor y
    rating = []               # 19 Vendor_Rating
    success = []              # 20 Vendor_Successful_Transactions
    href = []                 # 24 Product_Links (Urls)

    listing = soup.findAll('div', {"class": "card product-card mb-3"})

    # Populating the Number of Products
    nm = len(listing)

    # Finding Category: the page header is "Category: <name>", shared by
    # every product card on this listing page
    cat = soup.find("div", {"class": "col-9"})
    cat = cat.find("h2").text
    cat = cat.replace("Category: ", "")
    cat = cat.replace(",", "")
    cat = cat.strip()

    for card in listing:
        category.append(cat)

        bae = card.findAll('a')

        # Adding the url to the list of urls
        link = bae[0].get('href')
        link = cleanLink(link)
        href.append(link)

        # Finding Product Name
        product = bae[1].text
        product = product.replace('\n', ' ')
        product = product.replace(",", "")
        product = product.strip()
        name.append(product)

        # Finding Vendor
        vendor_name = bae[2].text
        vendor_name = vendor_name.replace(",", "")
        vendor_name = vendor_name.strip()
        vendor.append(vendor_name)

        # Finding USD
        usd = card.find('div', {"class": "mb-1"}).text
        usd = usd.replace("$", "")
        usd = usd.strip()
        USD.append(usd)

        # Finding Reviews: text looks like "(N review)" / "(N reviews)"
        num = card.find("span", {"class": "rate-count"}).text
        num = num.replace("(", "")
        num = num.replace("review)", "")
        num = num.replace("reviews)", "")
        num = num.strip()
        reviews.append(num)

        # Searching for CVE and MS categories inside the card.
        # Raw strings are required: '\d' in a plain string is an invalid
        # escape sequence (SyntaxWarning on Python >= 3.12).
        cve = card.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
        if not cve:
            cveValue = "-1"
        else:
            # Space-delimit all matches (same format the concat loop produced)
            cee = " " + " ".join(cve) + " "
            cee = cee.replace(',', ' ')
            cee = cee.replace('\n', '')
            cveValue = cee
        CVE.append(cveValue)

        ms = card.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
        if not ms:
            MSValue = "-1"
        else:
            me = " " + " ".join(ms) + " "
            me = me.replace(',', ' ')
            me = me.replace('\n', '')
            MSValue = me
        MS.append(MSValue)

    # Populate the final variable (this should be a list with all fields scraped)
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
def kerberos_links_parser(soup):
    """Collect the product-page links the crawler should visit.

    :param soup: BeautifulSoup tree of a listing page HTML
    :return: list of href strings, one per product block
    """

    # Returning all links that should be visited by the Crawler
    href = []

    content = soup.find('div', {"id": "content-pos"})
    listing = content.findAll('div', {"class": "item-block"})

    for div in listing:
        # BUG FIX: the original passed the set literal {"ae zx300"} as the
        # attrs argument, which is not a valid attribute mapping for bs4.
        # The evident intent is a class filter — TODO confirm against the
        # live page markup.
        ae = div.find('div', {"class": "ae zx300"})
        links = ae.findAll('a')
        # The second anchor in each block is the product link
        href.append(links[1]['href'])

    return href