# khangtran/dark_web_forums

__author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup

# This is the method to parse the Description Pages (one page to each Product in the Listing Pages)
def kerberos_description_parser(soup):
    """Parse one Kerberos product description page into a flat tuple of fields.

    Args:
        soup: Parsed HTML tree of the description page. It must support the
            BeautifulSoup-style ``find`` / ``findAll`` calls used below.

    Returns:
        A 23-item tuple of strings in this order: name, describe, lastSeen,
        rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow,
        terms, vendor, sold, addDate, available, endDate, BTC, USD, rating,
        success, EURO. Fields this parser does not extract keep the
        placeholder ``"-1"``.

    Raises:
        AttributeError: If an expected element (the ``col-9`` container, the
            ``h2`` title, the price or description blocks) is missing.
        IndexError: If fewer than five ``mb-1`` blocks are present, or the
            quantity text has no comma separating sold / in-stock counts.
    """
    # Imported locally so the parser does not rely on `re` leaking in through
    # the wildcard utilities import.
    import re

    # Fields to be parsed ("-1" means "not available")
    name = "-1"                         # 0 Product_Name (parsed below)
    describe = "-1"                     # 1 Product_Description (parsed below)
    lastSeen = "-1"                     # 2 Product_LastViewDate
    rules = "-1"                        # 3 NOT USED ...
    CVE = "-1"                          # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"                           # 5 Product_MS_Classification (Microsoft Security)
    review = "-1"                       # 6 Product_Number_Of_Reviews
    category = "-1"                     # 7 Product_Category
    shipFrom = "-1"                     # 8 Product_ShippedFrom
    shipTo = "-1"                       # 9 Product_ShippedTo
    left = "-1"                         # 10 Product_QuantityLeft (parsed below)
    escrow = "-1"                       # 11 Vendor_Warranty (parsed below)
    terms = "-1"                        # 12 Vendor_TermsAndConditions
    vendor = "-1"                       # 13 Vendor_Name (parsed below)
    sold = "-1"                         # 14 Product_QuantitySold (parsed below)
    addDate = "-1"                      # 15 Product_AddedDate
    available = "-1"                    # 16 NOT USED ...
    endDate = "-1"                      # 17 NOT USED ...
    BTC = "-1"                          # 18 Product_BTC_SellingPrice (parsed below)
    USD = "-1"                          # 19 Product_USD_SellingPrice (parsed below)
    rating = "-1"                       # 20 Vendor_Rating
    success = "-1"                      # 21 Vendor_Successful_Transactions
    EURO = "-1"                         # 22 Product_EURO_SellingPrice

    # Everything of interest lives inside the main "col-9" column.
    bae = soup.find('div', {'class': "col-9"})

    # Finding Product Name (commas are stripped from extracted fields)
    name = bae.find('h2').text
    name = name.replace('\n', ' ').replace(",", "").strip()

    # The "mb-1" blocks are read positionally:
    # [0] "Sold by: ...", [2] "Payment: ...", [4] "<n> sold, <m> in stock".
    mb = bae.findAll('div', {"class": "mb-1"})

    # Finding Vendor
    vendor = mb[0].text
    vendor = vendor.replace(",", "").replace("Sold by:", "").strip()

    # NOTE: vendor rating and shipping origin/destination are intentionally
    # left at "-1"; this parser does not extract them.

    # Finding Warranty
    escrow = mb[2].text.replace("Payment:", "").strip()

    # Finding Quantity Sold and Left
    quantities = mb[4].text.split(',')
    sold = quantities[0].replace("sold", "").strip()
    left = quantities[1].replace("in stock", "").strip()

    # Finding USD
    USD = bae.find('div', {"class": "h3 text-secondary"}).text
    USD = USD.replace("$", "").strip()

    # Finding BTC (the amount is whatever precedes the "BTC" unit label)
    BTC = bae.find('div', {"class": "small"}).text.split("BTC")[0].strip()

    # Finding the Product description
    describe = bae.find('div', {"class": "card border-top-0"}).text
    describe = describe.replace("\n", " ").replace("\r", " ").strip()

    # Searching for CVE and MS categories anywhere in the page text.
    # Output format: one leading space, then each matching text node followed
    # by two spaces (CVE) or one space (MS); commas become spaces and newlines
    # are dropped.
    cve = soup.findAll(text=re.compile(r'CVE-\d{4}-\d{4}'))
    if cve:
        CVE = " " + "  ".join(cve) + "  "
        CVE = CVE.replace(',', ' ').replace('\n', '')

    ms = soup.findAll(text=re.compile(r'MS\d{2}-\d{3}'))
    if ms:
        MS = " " + " ".join(ms) + " "
        MS = MS.replace(',', ' ').replace('\n', '')

    # Populating the final variable (this should be a list with all fields scraped)
    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)

    # Sending the results
    return row

# This is the method to parse the Listing Pages
def kerberos_listing_parser(soup):
    """Parse a Kerberos listing (category) page into per-product field lists.

    Each product card (``div.card.product-card.mb-3``) contributes one entry to
    the name, category, vendor, USD, reviews, CVE, MS and href lists. The
    remaining lists are passed through empty because this page does not
    provide those fields.

    Args:
        soup: Parsed HTML tree of the listing page. It must support the
            BeautifulSoup-style ``find`` / ``findAll`` calls used below.

    Returns:
        Whatever ``organizeProducts`` returns for the collected fields.

    Raises:
        AttributeError: If the ``col-9`` header, or a card's price or review
            element, is missing.
        IndexError: If a card has fewer than three anchors.
    """
    # Imported locally so the parser does not rely on `re` leaking in through
    # the wildcard utilities import.
    import re

    # Fields to be parsed
    nm = 0                                    # Total_Products (Should be Integer)
    mktName = "Kerberos"                      # 0 Marketplace_Name
    name = []                                 # 1 Product_Name (parsed below)
    CVE = []                                  # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []                                   # 3 Product_MS_Classification (Microsoft Security)
    category = []                             # 4 Product_Category (parsed below)
    describe = []                             # 5 Product_Description
    escrow = []                               # 6 Vendor_Warranty
    views = []                                # 7 Product_Number_Of_Views
    reviews = []                              # 8 Product_Number_Of_Reviews (parsed below)
    addDate = []                              # 9 Product_AddDate
    lastSeen = []                             # 10 Product_LastViewDate
    BTC = []                                  # 11 Product_BTC_SellingPrice
    USD = []                                  # 12 Product_USD_SellingPrice (parsed below)
    EURO = []                                 # 13 Product_EURO_SellingPrice
    sold = []                                 # 14 Product_QuantitySold
    qLeft = []                                # 15 Product_QuantityLeft
    shipFrom = []                             # 16 Product_ShippedFrom
    shipTo = []                               # 17 Product_ShippedTo
    vendor = []                               # 18 Vendor (parsed below)
    rating = []                               # 19 Vendor_Rating
    success = []                              # 20 Vendor_Successful_Transactions
    href = []                                 # 24 Product_Links (Urls)

    # Compiled once instead of once per product card.
    cve_pattern = re.compile(r'CVE-\d{4}-\d{4}')
    ms_pattern = re.compile(r'MS\d{2}-\d{3}')

    listing = soup.findAll('div', {"class": "card product-card mb-3"})

    # Populating the Number of Products
    nm = len(listing)

    # Finding Category (one heading per page, shared by every product on it)
    cat = soup.find("div", {"class": "col-9"}).find("h2").text
    cat = cat.replace("Category: ", "").replace(",", "").strip()

    for card in listing:
        category.append(cat)

        # Anchors are read positionally: [0] product link, [1] product name,
        # [2] vendor name.
        bae = card.findAll('a')

        # Adding the url to the list of urls
        href.append(cleanLink(bae[0].get('href')))

        # Finding Product Name
        name.append(bae[1].text.replace('\n', ' ').replace(",", "").strip())

        # Finding Vendor
        vendor.append(bae[2].text.replace(",", "").strip())

        # Finding USD (first "mb-1" block of the card)
        usd = card.find('div', {"class": "mb-1"}).text
        USD.append(usd.replace("$", "").strip())

        # Finding Reviews, e.g. "(3 reviews)" -> "3"
        num = card.find("span", {"class": "rate-count"}).text
        num = num.replace("(", "").replace("review)", "").replace("reviews)", "")
        reviews.append(num.strip())

        # Searching for CVE and MS categories within this card's text.
        # Output format: one leading space, then each matching text node
        # followed by two spaces (CVE) or one space (MS); commas become spaces
        # and newlines are dropped. "-1" when nothing matches.
        cve = card.findAll(text=cve_pattern)
        if cve:
            cveValue = " " + "  ".join(cve) + "  "
            cveValue = cveValue.replace(',', ' ').replace('\n', '')
        else:
            cveValue = "-1"
        CVE.append(cveValue)

        ms = card.findAll(text=ms_pattern)
        if ms:
            MSValue = " " + " ".join(ms) + " "
            MSValue = MSValue.replace(',', ' ').replace('\n', '')
        else:
            MSValue = "-1"
        MS.append(MSValue)

    # Populate the final variable (this should be a list with all fields scraped)
    # NOTE(review): `qLeft` is passed right after `EURO` and `sold` near the
    # end, which differs from the numbered field list above. Confirm against
    # the organizeProducts signature.
    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)

def kerberos_links_parser(soup):
    """Collect the product links on a listing page for the Crawler to visit.

    Args:
        soup: Parsed HTML tree of the listing page. It must support the
            BeautifulSoup-style ``find`` / ``findAll`` calls used below.

    Returns:
        A list with one ``href`` value per ``item-block`` element, taken from
        the second anchor inside that block's ``ae zx300`` wrapper.

    Raises:
        AttributeError: If the ``content-pos`` container or a block's
            ``ae zx300`` wrapper is missing.
        IndexError: If a wrapper holds fewer than two anchors.
    """
    # Returning all links that should be visited by the Crawler
    href = []

    content = soup.find('div', {"id": "content-pos"})
    listing = content.findAll('div', {"class": "item-block"})

    for div in listing:
        # The attrs filter was a bare set literal ({"ae zx300"}), which
        # BeautifulSoup only happens to interpret as a class filter; the
        # explicit "class" key states the intent.
        ae = div.find('div', {"class": "ae zx300"})
        links = ae.findAll('a')
        # The second anchor in the wrapper carries the product URL.
        href.append(links[1]['href'])

    return href