This is based on the calsyslab project.

__author__ = 'DarkWeb'

# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *

# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, ResultSet, Tag
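
# Note: cleanString and organizeProducts used below are not defined in this
# file; they are assumed to come from the wildcard utilities import above.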


def apocalypse_description_parser(soup: Tag):
    # Fields to be parsed
    vendor = "-1"  # 0 *Vendor_Name
    success = "-1"  # 1 Vendor_Successful_Transactions
    rating_vendor = "-1"  # 2 Vendor_Rating
    name = "-1"  # 3 *Product_Name
    describe = "-1"  # 4 Product_Description
    CVE = "-1"  # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = "-1"  # 6 Product_MS_Classification (Microsoft Security)
    category = "-1"  # 7 Product_Category
    views = "-1"  # 8 Product_Number_Of_Views
    reviews = "-1"  # 9 Product_Number_Of_Reviews
    rating_item = "-1"  # 10 Product_Rating
    addDate = "-1"  # 11 Product_AddedDate
    BTC = "-1"  # 12 Product_BTC_SellingPrice
    USD = "-1"  # 13 Product_USD_SellingPrice
    EURO = "-1"  # 14 Product_EURO_SellingPrice
    sold = "-1"  # 15 Product_QuantitySold
    left = "-1"  # 16 Product_QuantityLeft
    shipFrom = "-1"  # 17 Product_ShippedFrom
    shipTo = "-1"  # 18 Product_ShippedTo
    image = "-1"  # 19 Product_Image
    vendor_image = "-1"  # 20 Vendor_Image

    content: Tag = soup.find("div", {'id': "article_page"})

    product_name = content.find("p", {"class": "list-group-item text-center mb-0 box"}).text
    name = cleanString(product_name.strip())

    product_description = content.find("pre").text
    describe = cleanString(product_description.strip())

    # Finding Product Image (the src attribute holds a base64-encoded image)
    image = soup.find('div', {'class': 'col-md-7 text-center'}).find('img')
    image = image.get('src').split('base64,')[-1]

    # Counting the listed review entries gives the review total
    product_reviews_list: ResultSet[Tag] = content.find("table", {"class": "table product_reviews"}) \
        .find_all("li")
    reviews = str(len(product_reviews_list))

    product_category = content.find("a", {"class": "badge badge-danger"}).text
    category = cleanString(product_category.strip())

    product_ships_from = content.find("span", {"class": "badge badge-info"}).text
    shipFrom = cleanString(product_ships_from.strip())

    product_success_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-success"})
    product_ships_to = product_success_badge[1].text
    shipTo = cleanString(product_ships_to.strip())

    product_supply = content.find("span", {"class": "badge badge-warning"}).text
    left = cleanString(product_supply.strip())

    product_primary_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-primary"})
    # Product vendor comes in the form of "@ vendor_name"
    product_vendor = product_primary_badge[0].text.replace("@", "")
    vendor = cleanString(product_vendor.strip())
    sold = cleanString(product_primary_badge[1].text.strip())

    product_prices: Tag = content.find("p", {"style": "border-bottom:1px solid GREY;"})
    USD = product_prices.find("span", {"class": "pr"}).text
    prices_array: ResultSet[Tag] = product_prices.find_all("span", {"class": "pr1"})
    BTC = prices_array[1].text

    # Populating the final variable (this should be a list with all fields scraped)
    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)

    # Sending the results
    return row
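
# A minimal usage sketch (not part of the original crawler): assuming a product
# description page was saved locally, e.g. as "apocalypse_description.html"
# (hypothetical path), the parser above can be run on it directly:
#
#     with open("apocalypse_description.html", "r", encoding="utf-8") as f:
#         desc_soup = BeautifulSoup(f.read(), "html.parser")
#     row = apocalypse_description_parser(desc_soup)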


def apocalypse_listing_parser(soup: Tag):
    # Fields to be parsed
    nm = 0  # Total_Products (Should be Integer)
    mktName = "Apocalypse"  # 0 Marketplace_Name
    name = []  # 1 Product_Name
    CVE = []  # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
    MS = []  # 3 Product_MS_Classification (Microsoft Security)
    category = []  # 4 Product_Category
    describe = []  # 5 Product_Description
    escrow = []  # 6 Vendor_Warranty
    views = []  # 7 Product_Number_Of_Views
    reviews = []  # 8 Product_Number_Of_Reviews
    addDate = []  # 9 Product_AddDate
    lastSeen = []  # 10 Product_LastViewDate
    BTC = []  # 11 Product_BTC_SellingPrice
    USD = []  # 12 Product_USD_SellingPrice
    EURO = []  # 13 Product_EURO_SellingPrice
    sold = []  # 14 Product_QuantitySold
    qLeft = []  # 15 Product_QuantityLeft
    shipFrom = []  # 16 Product_ShippedFrom
    shipTo = []  # 17 Product_ShippedTo
    vendor = []  # 18 Vendor
    rating = []  # 19 Vendor_Rating
    success = []  # 20 Vendor_Successful_Transactions
    image = []  # 21 Product_Image
    image_vendor = []  # 22 Vendor_Image
    href = []  # 23 Product_Links

    table = soup.find("div", {"class": "col-lg-9 my-4"})
    if table is None:
        table = soup.find("div", {"class": "col-lg-9"})

    listings: ResultSet[Tag] = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})

    for prod in listings:
        product_name = prod.find('h5', {"class": "art_title"}).text
        name.append(cleanString(product_name.strip()))

        # Finding Product Image (the src attribute holds a base64-encoded image)
        product_image = prod.find('img', {'class': 'customHeight'})
        product_image = product_image.get('src').split('base64,')[-1]
        image.append(product_image)

        # Fields that are not present on the listing page default to "-1"
        CVE.append("-1")
        MS.append("-1")
        describe.append("-1")
        escrow.append("-1")
        reviews.append("-1")
        addDate.append("-1")
        lastSeen.append("-1")
        BTC.append("-1")
        EURO.append("-1")
        shipTo.append("-1")
        success.append("-1")
        image_vendor.append("-1")

        product_price = prod.find("span", {"class": "priceP"}).text
        USD.append(cleanString(product_price.strip()))

        product_statistics: ResultSet[Tag] = prod.find_all("p", {"class": "mb-0 card-text"})

        product_category = product_statistics[0].find("a").text
        category.append(cleanString(product_category.strip()))

        product_sold = product_statistics[1].find("span").text
        sold.append(cleanString(product_sold.strip()))

        product_quantity_left = product_statistics[2].find("span", {"class": "badge bluebadge"}).text
        qLeft.append(cleanString(product_quantity_left.strip()))

        product_views = product_statistics[3].find("span").text
        views.append(cleanString(product_views.strip()))

        product_ships_from = product_statistics[4].find("span").text
        shipFrom.append(cleanString(product_ships_from.strip()))

        product_vendor_tag: Tag = product_statistics[5].find("a").find("span", {"class": "badge badge-primary"})
        # Product vendors & ratings are displayed as "vendor_name ★ 5.0";
        # when split by the star (★), this should return a 2-value array
        product_vendor, product_vendor_rating = product_vendor_tag.text.split("★")
        vendor.append(cleanString(product_vendor.strip()))
        rating.append(cleanString(product_vendor_rating.strip()))

        product_href = prod.find('a').get('href')
        href.append(product_href)

        nm += 1

    return organizeProducts(
        marketplace=mktName,
        nm=nm,
        vendor=vendor,
        rating_vendor=rating,
        success_vendor=success,
        nombre=name,
        CVE=CVE,
        MS=MS,
        category=category,
        describe=describe,
        views=views,
        reviews=reviews,
        rating_item=["-1" for _ in range(nm)],
        addDate=addDate,
        BTC=BTC,
        USD=USD,
        EURO=EURO,
        sold=sold,
        qLeft=qLeft,
        shipFrom=shipFrom,
        shipTo=shipTo,
        href=href,
        image=image,
        image_vendor=image_vendor
    )


# Called by the crawler to get description links on a listing page
# @param: beautifulsoup object that is using the correct html page (listing page)
# @return: list of description links from a listing page
def apocalypse_links_parser(soup):
    # Returning all links that should be visited by the Crawler
    href = []

    listing = soup.find_all('div', {"class": "col-lg-4 col-md-6 mb-1"})

    for a in listing:
        bae = a.find('a', href=True)
        link = bae['href']
        href.append(link)

    return href
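
# A minimal usage sketch (not part of the original crawler): assuming a listing
# page was saved locally as "apocalypse_listing.html" (hypothetical path), the
# listing and link parsers above can be exercised like this.
if __name__ == "__main__":
    with open("apocalypse_listing.html", "r", encoding="utf-8") as f:
        listing_soup = BeautifulSoup(f.read(), "html.parser")
    # Description-page links the crawler would visit next
    print(apocalypse_links_parser(listing_soup))
    # Product rows as organized by organizeProducts (from the utilities import)
    print(apocalypse_listing_parser(listing_soup))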