Browse Source

Created test file for class-based organization for parser data

main
Khoi 1 year ago
parent
commit
c09298652a
6 changed files with 119 additions and 7 deletions
  1. +1
    -1
      Forums/Initialization/forumsList.txt
  2. +2
    -0
      Forums/Initialization/forums_mining.py
  3. +2
    -2
      Forums/OnniForums/parser.py
  4. +2
    -2
      MarketPlaces/MikesGrandStore/parser.py
  5. +81
    -2
      MarketPlaces/ThiefWorld/parser.py
  6. +31
    -0
      test.py

+ 1
- 1
Forums/Initialization/forumsList.txt View File

@ -1 +1 @@
OnniForums
OnniForums

+ 2
- 0
Forums/Initialization/forums_mining.py View File

@ -114,6 +114,8 @@ if __name__ == '__main__':
elif forum == "HiddenAnswers": elif forum == "HiddenAnswers":
crawlerHiddenAnswers() crawlerHiddenAnswers()


+ 2
- 2
Forums/OnniForums/parser.py View File

@ -115,8 +115,8 @@ def onniForums_description_parser(soup: BeautifulSoup) -> tuple:
def onniForums_listing_parser(soup: BeautifulSoup): def onniForums_listing_parser(soup: BeautifulSoup):
boardName = "-1" # board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
boardName = "-1" # board name (the previous level of the topic in the Forum categorization tree.
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
nm = 0 # this variable should receive the number of topics nm = 0 # this variable should receive the number of topics
topic : List[str] = [] # all topics topic : List[str] = [] # all topics


+ 2
- 2
MarketPlaces/MikesGrandStore/parser.py View File

@ -85,11 +85,11 @@ def mikesGrandStore_description_parser(soup: BeautifulSoup) -> Tuple:
return row return row
def mikesGtrandStore_listing_parser(soup: BeautifulSoup) -> List:
def mikesGrandStore_listing_parser(soup: BeautifulSoup) -> List:
# Fields to be parsed # Fields to be parsed
nm = 0 # Total_Products (Should be Integer) nm = 0 # Total_Products (Should be Integer)
mktName = "DarkFox" # 0 Marketplace_Name
mktName = "MikesGrandStore" # 0 Marketplace_Name
name = [] # 1 Product_Name name = [] # 1 Product_Name
CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
MS = [] # 3 Product_MS_Classification (Microsoft Security) MS = [] # 3 Product_MS_Classification (Microsoft Security)


+ 81
- 2
MarketPlaces/ThiefWorld/parser.py View File

@ -1,11 +1,11 @@
__author__ = 'DarkWeb' __author__ = 'DarkWeb'
# Here, we are importing the auxiliary functions to clean or convert data # Here, we are importing the auxiliary functions to clean or convert data
from typing import List
from typing import List, Tuple
from MarketPlaces.Utilities.utilities import * from MarketPlaces.Utilities.utilities import *
# Here, we are importing BeautifulSoup to search through the HTML tree # Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup, Tag
from bs4 import BeautifulSoup, ResultSet, Tag
def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple: def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple:
@ -225,6 +225,85 @@ def darkfox_description_parser(soup):
return row return row
def thiefWorld_listing_parser(soup: BeautifulSoup):
    """Parse a ThiefWorld listing page into organized product data.

    Every ``<div class="catalog_item">`` in ``soup`` is treated as one
    product.  For each product the name, link, description, USD price text
    and vendor name are read from the page; every other column is filled
    with the ``'-1'`` placeholder.  All columns are then handed to
    ``organizeProducts`` and its result is returned unchanged.

    :param soup: parsed HTML of a listing page
    :return: whatever ``organizeProducts`` returns for the collected columns
    :raises AttributeError: if a catalog item lacks one of the expected
        title / text / price / vendor elements (``find`` returns ``None``)
    """
    mktName = "ThiefWorld"                      # Marketplace_Name

    catalog_items: ResultSet[Tag] = soup.find_all('div', {'class': 'catalog_item'})
    nm = len(catalog_items)                     # Total_Products

    # Columns that are actually read from the listing page.
    name = []                                   # Product_Name
    href = []                                   # Product_Links (Urls)
    describe = []                               # Product_Description
    USD = []                                    # Product_USD_SellingPrice
    vendor = []                                 # Vendor

    for item in catalog_items:
        # The title anchor carries both the product name and its link.
        title_link: Tag = item.find('div', {'class': 'title'}).find('a')
        name.append(cleanString(title_link.text.strip()))
        href.append(title_link.get('href'))

        description_text = item.find('div', {'class': 'text'}).text
        describe.append(cleanString(description_text.strip()))

        # Keep only the part of the price text that precedes 'USD'.
        price_text = item.find('span', {'class': 'price'}).find('span').text
        USD.append(cleanString(price_text.split('USD')[0].strip()))

        vendor_text = item.find('div', {'class': 'market over'}).find('a').text
        vendor.append(cleanString(vendor_text.strip()))

    # Columns not read from the listing page: one '-1' placeholder per product.
    CVE = ['-1'] * nm                           # Product_CVE_Classification
    MS = ['-1'] * nm                            # Product_MS_Classification
    category = ['-1'] * nm                      # Product_Category
    escrow = ['-1'] * nm                        # Vendor_Warranty
    views = ['-1'] * nm                         # Product_Number_Of_Views
    reviews = ['-1'] * nm                       # Product_Number_Of_Reviews
    addDate = ['-1'] * nm                       # Product_AddDate
    lastSeen = ['-1'] * nm                      # Product_LastViewDate
    BTC = ['-1'] * nm                           # Product_BTC_SellingPrice
    EURO = ['-1'] * nm                          # Product_EURO_SellingPrice
    sold = ['-1'] * nm                          # Product_QuantitySold
    qLeft = ['-1'] * nm                         # Product_QuantityLeft
    shipFrom = ['-1'] * nm                      # Product_ShippedFrom
    shipTo = ['-1'] * nm                        # Product_ShippedTo
    rating = ['-1'] * nm                        # Vendor_Rating
    success = ['-1'] * nm                       # Vendor_Successful_Transactions

    return organizeProducts(
        mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews,
        addDate, lastSeen, BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor,
        rating, success, sold, href,
    )
# Parses a listing page: takes the page's HTML as a soup object and extracts the information it needs.
# The extracted information is stored in separate lists, which are returned after being organized.
# @param: soup object for the HTML of a listing page


+ 31
- 0
test.py View File

@ -0,0 +1,31 @@
from dataclasses import dataclass
import datetime
from typing import Iterable, List
@dataclass
class ForumPost:
    """Fields of a single forum post.

    Every text field defaults to the ``'-1'`` placeholder and ``datePosted``
    defaults to 1970-01-01, so an instance can be created with no arguments
    and filled in field by field.
    """

    userName: str = '-1'
    status: str = '-1'
    reputation: str = '-1'
    interest: str = '-1'
    sign: str = '-1'
    post: str = '-1'
    feedback: str = '-1'
    # Annotated with the datetime *class*; the bare name ``datetime`` is the
    # module here (``import datetime``) and is not a valid type.
    datePosted: datetime.datetime = datetime.datetime(1970, 1, 1)
@dataclass
class ForumTopic:
    """A forum topic together with the posts collected for it.

    All four fields are required; none has a default.
    """

    topicId: str
    topicName: str
    href: str
    # Posts of this topic, each a ForumPost.
    postList: List[ForumPost]
@dataclass
class ForumListing:
    """A forum listing: a board name, a topic count and the topics themselves.

    All three fields are required; none has a default.
    """

    boardName: str
    # NOTE(review): presumably expected to equal len(topicList) — confirm with the parser that fills it.
    topicsNum: int
    # Topics of this listing, each a ForumTopic.
    topicList: List[ForumTopic]

Loading…
Cancel
Save