khangtran
/
dark_web_forums


								__author__ = 'DarkWeb'


								# Here, we are importing the auxiliary functions to clean or convert data

								from MarketPlaces.Utilities.utilities import *


								# Here, we are importing BeautifulSoup to search through the HTML tree

								from bs4 import BeautifulSoup, ResultSet, Tag


								def apocalypse_description_parser(soup: Tag):
								    """Parse an Apocalypse product description page into a flat row of fields.

								    Fields this parser does not look up keep the placeholder "-1".

								    :param soup: parsed HTML of a product description page. It must contain
								        the ``<div id="article_page">`` container that most lookups start from.
								    :return: 21-tuple ordered as (vendor, rating_vendor, success, name,
								        describe, CVE, MS, category, views, reviews, rating_item, addDate,
								        BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image).
								    :raises AttributeError: if an expected element is absent (``find``
								        returned ``None``).
								    :raises IndexError: if fewer badge or price spans exist than are indexed.
								    """

								    # Fields to be parsed; the index is the position in the returned row
								    vendor = "-1"                       # 0 *Vendor_Name
								    rating_vendor = "-1"                # 1 Vendor_Rating
								    success = "-1"                      # 2 Vendor_Successful_Transactions
								    name = "-1"                         # 3 *Product_Name
								    describe = "-1"                     # 4 Product_Description
								    CVE = "-1"                          # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
								    MS = "-1"                           # 6 Product_MS_Classification (Microsoft Security)
								    category = "-1"                     # 7 Product_Category
								    views = "-1"                        # 8 Product_Number_Of_Views
								    reviews = "-1"                      # 9 Product_Number_Of_Reviews
								    rating_item = "-1"                  # 10 Product_Rating
								    addDate = "-1"                      # 11 Product_AddedDate
								    BTC = "-1"                          # 12 Product_BTC_SellingPrice
								    USD = "-1"                          # 13 Product_USD_SellingPrice
								    EURO = "-1"                         # 14 Product_EURO_SellingPrice
								    sold = "-1"                         # 15 Product_QuantitySold
								    left = "-1"                         # 16 Product_QuantityLeft
								    shipFrom = "-1"                     # 17 Product_ShippedFrom
								    shipTo = "-1"                       # 18 Product_ShippedTo
								    image = "-1"                        # 19 Product_Image
								    vendor_image = "-1"                 # 20 Vendor_Image

								    content: Tag = soup.find("div", {'id': "article_page"})

								    product_name = content.find("p", {"class": "list-group-item text-center mb-0 box"}).text
								    name = cleanString(product_name.strip())

								    product_description = content.find("pre").text
								    describe = cleanString(product_description.strip())

								    # Finding Product Image: keep only what follows the last "base64,"
								    # marker in src (the whole src when the marker is absent).
								    # Note this lookup starts from the full page, not from `content`.
								    image_tag = soup.find('div', {'class': 'col-md-7 text-center'}).find('img')
								    image = image_tag.get('src').split('base64,')[-1]

								    # The review count is the number of <li> entries in the reviews table.
								    # It must be stored in `reviews`, the name the returned row uses.
								    product_reviews_list: ResultSet[Tag] = content.find(
								        "table", {"class": "table product_reviews"}
								    ).find_all("li")
								    reviews = str(len(product_reviews_list))

								    product_category = content.find("a", {"class": "badge badge-danger"}).text
								    category = cleanString(product_category.strip())

								    product_ships_from = content.find("span", {"class": "badge badge-info"}).text
								    shipFrom = cleanString(product_ships_from.strip())

								    # The second "badge-success" span holds the ship-to destination
								    product_success_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-success"})
								    product_ships_to = product_success_badge[1].text
								    shipTo = cleanString(product_ships_to.strip())

								    product_supply = content.find("span", {"class": "badge badge-warning"}).text
								    left = cleanString(product_supply.strip())

								    # "badge-primary" spans: [0] is the vendor, [1] is the quantity sold
								    product_primary_badge: ResultSet[Tag] = content.find_all("span", {"class": "badge badge-primary"})

								    # Product vendor comes in the form of "@ vendor_name"
								    product_vendor = product_primary_badge[0].text.replace("@", "")
								    vendor = cleanString(product_vendor.strip())

								    sold = cleanString(product_primary_badge[1].text.strip())

								    # Prices: USD is the first "pr" span, BTC is the second "pr1" span
								    product_prices: Tag = content.find("p", {"style": "border-bottom:1px solid GREY;"})
								    USD = product_prices.find("span", {"class": "pr"}).text
								    prices_array: ResultSet[Tag] = product_prices.find_all("span", {"class": "pr1"})
								    BTC = prices_array[1].text

								    # Populating the final variable (a tuple with all fields scraped)
								    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
								           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)

								    # Sending the results
								    return row


								def apocalypse_listing_parser(soup: Tag):
								    """Parse an Apocalypse listing page into parallel per-product field lists.

								    Every list handed to ``organizeProducts`` receives exactly one entry per
								    product card, so index ``i`` refers to the same product in all of them.
								    Fields this parser does not look up are filled with "-1".

								    :param soup: parsed HTML of a listing page. The product grid is looked up
								        as ``div.col-lg-9.my-4`` first, then as ``div.col-lg-9``.
								    :return: whatever ``organizeProducts`` returns for the collected fields.
								    :raises AttributeError: if the grid or an expected element inside a
								        product card is absent (``find`` returned ``None``).
								    :raises IndexError: if a card has fewer than six "mb-0 card-text"
								        paragraphs.
								    """

								    # Fields to be parsed
								    nm = 0                                    # Total_Products (Should be Integer)
								    mktName = "Apocalypse"                    # Marketplace_Name
								    name = []                                 # Product_Name
								    CVE = []                                  # Product_CVE_Classification (Common Vulnerabilities and Exposures)
								    MS = []                                   # Product_MS_Classification (Microsoft Security)
								    category = []                             # Product_Category
								    describe = []                             # Product_Description
								    views = []                                # Product_Number_Of_Views
								    reviews = []                              # Product_Number_Of_Reviews
								    addDate = []                              # Product_AddDate
								    BTC = []                                  # Product_BTC_SellingPrice
								    USD = []                                  # Product_USD_SellingPrice
								    EURO = []                                 # Product_EURO_SellingPrice
								    sold = []                                 # Product_QuantitySold
								    qLeft = []                                # Product_QuantityLeft
								    shipFrom = []                             # Product_ShippedFrom
								    shipTo = []                               # Product_ShippedTo
								    vendor = []                               # Vendor
								    rating = []                               # Vendor_Rating
								    success = []                              # Vendor_Successful_Transactions
								    image = []                                # Product_Image
								    image_vendor = []                         # Vendor_Image
								    href = []                                 # Product_Links

								    # Lists for fields this parser never looks up; each gets "-1" per product
								    unavailable = (CVE, MS, describe, reviews, addDate, BTC, EURO, shipTo, success, image_vendor)

								    table = soup.find("div", {"class": "col-lg-9 my-4"})
								    if table is None:
								        table = soup.find("div", {"class": "col-lg-9"})
								    listings: ResultSet[Tag] = table.find_all("div", {"class": "col-lg-4 col-md-6 mb-1"})

								    for prod in listings:

								        product_name = prod.find('h5', {"class": "art_title"}).text
								        name.append(cleanString(product_name.strip()))

								        # Finding Product Image: keep only what follows the last "base64,"
								        # marker in src (the whole src when the marker is absent)
								        product_image = prod.find('img', {'class': 'customHeight'})
								        image.append(product_image.get('src').split('base64,')[-1])

								        for field in unavailable:
								            field.append("-1")

								        product_price = prod.find("span", {"class": "priceP"}).text
								        USD.append(cleanString(product_price.strip()))

								        # The remaining fields are read positionally from the card's
								        # "mb-0 card-text" paragraphs.
								        product_statistics: ResultSet[Tag] = prod.find_all("p", {"class": "mb-0 card-text"})

								        product_category = product_statistics[0].find("a").text
								        category.append(cleanString(product_category.strip()))

								        # Quantity sold is appended exactly once per product so `sold`
								        # stays the same length as the other lists.
								        # NOTE(review): a "badge badge-success" span lookup was also being
								        # appended here; presumably it is the same element - confirm.
								        product_sold = product_statistics[1].find("span").text
								        sold.append(cleanString(product_sold.strip()))

								        product_quantity_left = product_statistics[2].find("span", {"class": "badge bluebadge"}).text
								        qLeft.append(cleanString(product_quantity_left.strip()))

								        product_views = product_statistics[3].find("span").text
								        views.append(cleanString(product_views.strip()))

								        product_ships_from = product_statistics[4].find("span").text
								        shipFrom.append(cleanString(product_ships_from.strip()))

								        product_vendor_tag: Tag = product_statistics[5].find("a").find("span", {"class": "badge badge-primary"})

								        # Product vendors & ratings are displayed as "vendor_name ★ 5.0".
								        # partition() splits at the first star; a badge without a star
								        # yields the vendor name and the "-1" placeholder for the rating.
								        product_vendor, star, product_vendor_rating = product_vendor_tag.text.partition("★")
								        vendor.append(cleanString(product_vendor.strip()))
								        if star:
								            rating.append(cleanString(product_vendor_rating.strip()))
								        else:
								            rating.append("-1")

								        product_href = prod.find('a').get('href')
								        href.append(product_href)

								        nm += 1

								    return organizeProducts(
								        marketplace=mktName,
								        nm=nm,
								        vendor=vendor,
								        rating_vendor=rating,
								        success_vendor=success,
								        nombre=name,
								        CVE=CVE,
								        MS=MS,
								        category=category,
								        describe=describe,
								        views=views,
								        reviews=reviews,
								        rating_item=["-1"] * nm,
								        addDate=addDate,
								        BTC=BTC,
								        USD=USD,
								        EURO=EURO,
								        sold=sold,
								        qLeft=qLeft,
								        shipFrom=shipFrom,
								        shipTo=shipTo,
								        href=href,
								        image=image,
								        image_vendor=image_vendor
								    )


								def apocalypse_links_parser(soup):
								    """Collect the description-page links found on a listing page.

								    Called by the crawler to get the links it should visit next.

								    :param soup: BeautifulSoup object built from a listing page.
								    :return: list with the ``href`` of the first ``<a href=...>`` inside each
								        ``div.col-lg-4.col-md-6.mb-1`` product card, in page order. Cards
								        without such an anchor are skipped.
								    """

								    # Returning all links that should be visited by the Crawler
								    href = []

								    # find_all is the current name of the deprecated findAll alias
								    listing = soup.find_all('div', {"class": "col-lg-4 col-md-6 mb-1"})

								    for card in listing:
								        anchor = card.find('a', href=True)
								        # A card without a link has nothing to crawl; skip it rather
								        # than fail on subscripting None.
								        if anchor is None:
								            continue
								        href.append(anchor['href'])

								    return href