From c09298652ab3c97fb5c831a6dd11fd2c7fff23cc Mon Sep 17 00:00:00 2001 From: Khoi <110973167+marked01one@users.noreply.github.com> Date: Thu, 29 Jun 2023 10:35:16 -0700 Subject: [PATCH] Created test file for class-based organization for parser data --- Forums/Initialization/forumsList.txt | 2 +- Forums/Initialization/forums_mining.py | 2 + Forums/OnniForums/parser.py | 4 +- MarketPlaces/MikesGrandStore/parser.py | 4 +- MarketPlaces/ThiefWorld/parser.py | 83 +++++++++++++++++++++++++- test.py | 31 ++++++++++ 6 files changed, 119 insertions(+), 7 deletions(-) create mode 100644 test.py diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt index 3b57198..f1320f9 100644 --- a/Forums/Initialization/forumsList.txt +++ b/Forums/Initialization/forumsList.txt @@ -1 +1 @@ -OnniForums +OnniForums \ No newline at end of file diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py index 5e8aa92..a75b2c1 100644 --- a/Forums/Initialization/forums_mining.py +++ b/Forums/Initialization/forums_mining.py @@ -114,6 +114,8 @@ if __name__ == '__main__': elif forum == "HiddenAnswers": crawlerHiddenAnswers() + + diff --git a/Forums/OnniForums/parser.py b/Forums/OnniForums/parser.py index 1e221a7..20d8802 100644 --- a/Forums/OnniForums/parser.py +++ b/Forums/OnniForums/parser.py @@ -115,8 +115,8 @@ def onniForums_description_parser(soup: BeautifulSoup) -> tuple: def onniForums_listing_parser(soup: BeautifulSoup): - boardName = "-1" # board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) + boardName = "-1" # board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) nm = 0 # this variable should receive the number of topics topic : List[str] = [] # all topics diff --git a/MarketPlaces/MikesGrandStore/parser.py b/MarketPlaces/MikesGrandStore/parser.py index 6a24815..fe9bd61 100644 --- a/MarketPlaces/MikesGrandStore/parser.py +++ b/MarketPlaces/MikesGrandStore/parser.py @@ -85,11 +85,11 @@ def mikesGrandStore_description_parser(soup: BeautifulSoup) -> Tuple: return row -def mikesGtrandStore_listing_parser(soup: BeautifulSoup) -> List: +def mikesGrandStore_listing_parser(soup: BeautifulSoup) -> List: # Fields to be parsed nm = 0 # Total_Products (Should be Integer) - mktName = "DarkFox" # 0 Marketplace_Name + mktName = "MikesGrandStore" # 0 Marketplace_Name name = [] # 1 Product_Name CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) MS = [] # 3 Product_MS_Classification (Microsoft Security) diff --git a/MarketPlaces/ThiefWorld/parser.py b/MarketPlaces/ThiefWorld/parser.py index f091b53..92e4214 100644 --- a/MarketPlaces/ThiefWorld/parser.py +++ b/MarketPlaces/ThiefWorld/parser.py @@ -1,11 +1,11 @@ __author__ = 'DarkWeb' # Here, we are importing the auxiliary functions to clean or convert data -from typing import List +from typing import List, Tuple from MarketPlaces.Utilities.utilities import * # Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup, Tag +from bs4 import BeautifulSoup, ResultSet, Tag def thiefWorld_description_parser(soup: BeautifulSoup) -> Tuple: @@ -225,6 +225,85 @@ def darkfox_description_parser(soup): return row +def thiefWorld_listing_parser(soup: BeautifulSoup): + + # Fields to be parsed + nm = 0 # Total_Products (Should be Integer) + mktName = "ThiefWorld" # 0 Marketplace_Name + name = [] # 1 Product_Name + CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = [] # 3 Product_MS_Classification (Microsoft Security) + category = [] # 4 Product_Category + describe = [] # 5 Product_Description + escrow = [] # 6 Vendor_Warranty + views = [] # 7 Product_Number_Of_Views + reviews = [] # 8 Product_Number_Of_Reviews + addDate = [] # 9 Product_AddDate + lastSeen = [] # 10 Product_LastViewDate + BTC = [] # 11 Product_BTC_SellingPrice + USD = [] # 12 Product_USD_SellingPrice + EURO = [] # 13 Product_EURO_SellingPrice + sold = [] # 14 Product_QuantitySold + qLeft =[] # 15 Product_QuantityLeft + shipFrom = [] # 16 Product_ShippedFrom + shipTo = [] # 17 Product_ShippedTo + vendor = [] # 18 Vendor + rating = [] # 19 Vendor_Rating + success = [] # 20 Vendor_Successful_Transactions + href = [] # 23 Product_Links (Urls) + + productList: ResultSet[Tag] = soup.find_all('div', {'class': 'catalog_item'}) + + nm = len(productList) + + for product in productList: + + productTitle: Tag = product.find('div', {'class': 'title'}).find('a') + + productName = cleanString(productTitle.text.strip()) + name.append(productName) + + productHref = productTitle.get('href') + href.append(productHref) + + CVE.append('-1') + MS.append('-1') + category.append('-1') + + productDescription = product.find('div', {'class': 'text'}).text + productDescription = cleanString(productDescription.strip()) + describe.append(productDescription) + + escrow.append('-1') + views.append('-1') + reviews.append('-1') + addDate.append('-1') + lastSeen.append('-1') + BTC.append('-1') + + priceText = product.find('span', {'class': 'price'}).find('span').text + priceText = priceText.split('USD')[0] + priceText = cleanString(priceText.strip()) + USD.append(priceText) + + EURO.append('-1') + sold.append('-1') + qLeft.append('-1') + shipFrom.append('-1') + shipTo.append('-1') + + productVendor = product.find('div', {'class': 'market over'}).find('a').text + productVendor = cleanString(productVendor.strip()) + vendor.append(productVendor) + + rating.append('-1') + success.append('-1') + + + return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen, + BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href) + + #parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of listing page diff --git a/test.py b/test.py new file mode 100644 index 0000000..53043c5 --- /dev/null +++ b/test.py @@ -0,0 +1,31 @@ +from dataclasses import dataclass +import datetime +from typing import Iterable, List + + +@dataclass +class ForumPost: + userName: str = '-1' + status: str = '-1' + reputation: str = '-1' + interest: str = '-1' + sign: str = '-1' + post: str = '-1' + feedback: str = '-1' + datePosted: datetime = datetime.datetime(1970,1,1) + + +@dataclass +class ForumTopic: + topicId: str + topicName: str + href: str + postList: List[ForumPost] + + +@dataclass +class ForumListing: + boardName: str + topicsNum: int + topicList: List[ForumTopic] +