From 594e52949b815e713fe19d347bae712e534f03bd Mon Sep 17 00:00:00 2001 From: Joshua Date: Thu, 27 Jul 2023 10:28:58 -0700 Subject: [PATCH] Working on Parser. Finished description parser. --- MarketPlaces/HiddenMarket/crawler_selenium.py | 30 ++--- MarketPlaces/HiddenMarket/parser.py | 112 +++++++++++------- MarketPlaces/Initialization/geckodriver.log | 71 +++++++++++ 3 files changed, 157 insertions(+), 56 deletions(-) diff --git a/MarketPlaces/HiddenMarket/crawler_selenium.py b/MarketPlaces/HiddenMarket/crawler_selenium.py index 714280f..2582967 100644 --- a/MarketPlaces/HiddenMarket/crawler_selenium.py +++ b/MarketPlaces/HiddenMarket/crawler_selenium.py @@ -29,19 +29,19 @@ baseURL = 'http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion # Opens Tor Browser, crawls the website def startCrawling(): - opentor() + # opentor() marketName = getMKTName() - driver = getAccess() + # driver = getAccess() - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closetor(driver) + # if driver != 'down': + # try: + # login(driver) + # crawlForum(driver) + # except Exception as e: + # print(driver.current_url, e) + # closetor(driver) - # new_parse(marketName, baseURL, False) + new_parse(marketName, baseURL, False) # Opens Tor Browser @@ -211,11 +211,11 @@ def getInterestedLinks(): links = [] # # Civil Software - # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/civil_softwares') + links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/civil_softwares') # # Tutorials - Carding # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/carding') # # Digital - Hacks - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/hacks') + # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/hacks') # Digital - Exploit Kit # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/exploit_kit') # # 0Day @@ -275,11 +275,11 @@ def crawlForum(driver): driver.back() # comment out - break + # break # comment out - # if count == 1: - # break + if count == 2: + break try: pageCount += 1 diff --git a/MarketPlaces/HiddenMarket/parser.py b/MarketPlaces/HiddenMarket/parser.py index ec108d9..c75e97e 100644 --- a/MarketPlaces/HiddenMarket/parser.py +++ b/MarketPlaces/HiddenMarket/parser.py @@ -31,20 +31,19 @@ def hiddenmarket_description_parser(soup): shipFrom = "-1" # 17 Product_ShippedFrom shipTo = "-1" # 18 Product_ShippedTo - bae = soup.find('div', {'class': "col-9"}) + bae = soup.find('div', {'class': "main"}) # Finding Product Name - name = bae.find('h2').text + name = bae.find('div', {'class': "heading"}).text name = name.replace('\n', ' ') name = name.replace(",", "") name = name.strip() - mb = bae.findAll('div', {"class": "mb-1"}) + mb = bae.find('div', {'class': "information"}).findAll('tr') # Finding Vendor - vendor = mb[0].text + vendor = mb[1].find('a').text vendor = vendor.replace(",", "") - vendor = vendor.replace("Sold by:", "") vendor = vendor.strip() # # Finding Vendor Rating @@ -52,41 +51,50 @@ def hiddenmarket_description_parser(soup): # half_star = bae[2].find('i', {'class': "fas fa-star-half-alt"}) # rating = len(full_stars) + (0.5 if half_star is not None else 0) - # Finding Quantity Sold and Left - temp = mb[4].text.split(',') - - sold = temp[0].replace("sold", "") - sold = sold.strip() - - left = temp[1].replace("in stock", "") + # Finding Quantity Left + temp = mb[-3].text + left = temp.replace("Quantity in stock:", "") left = left.strip() # Finding USD - USD = bae.find('div', {"class": "h3 text-secondary"}).text - USD = USD.replace("$", "") + USD = mb[0].text + USD = USD.replace("Price:", "") + USD = USD.replace("USD", "") USD = USD.strip() # Finding BTC - temp = bae.find('div', {"class": "small"}).text.split("BTC") + # temp = bae.find('div', {"class": "small"}).text.split("BTC") - BTC = temp[0].strip() + # BTC = temp[0].strip() - # shipping_info = bae[4].text - # if "Digital" not in shipping_info: - # shipping_info = shipping_info.split(" ") - # - # # Finding Shipment Information (Origin) - # shipFrom = shipping_info[0].strip() - # - # # Finding Shipment Information (Destination) - # shipTo = shipping_info[1].strip() + # Finding Shipment Information (Origin) + shipFrom = mb[2].text + shipFrom = shipFrom.replace("Seller location:", "") + shipFrom = shipFrom.strip() + + # Finding Shipment Information (Destination) + shipTo = mb[3].text + shipTo = shipTo.replace("Ships to (seller):", "") + shipTo = shipTo.strip() # Finding the Product description - describe = bae.find('div', {"class": "card border-top-0"}).text + describe = bae.find('div', {"class": "twotabs"}).find('div', {'class': "tab1"}).text describe = describe.replace("\n", " ") describe = describe.replace("\r", " ") + describe = describe.replace("-", " ") describe = describe.strip() + # Finding the Product Category + category = mb[-4].text + category = category.replace("Category:", "") + category = category.strip() + + #Finding the number of reviews + reviews = bae.find_all('div', {'class': "heading"}) + reviews = reviews[-2].text + reviews = reviews.replace("Comments (", "") + reviews = reviews.replace(")", "") + # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if cve: @@ -139,54 +147,76 @@ def hiddenmarket_listing_parser(soup): shipTo = [] # 19 Product_ShippedTo href = [] # 20 Product_Links - listing = soup.findAll('div', {"class": "card product-card mb-3"}) + listing = soup.findAll('div', {"class": "info"}) # Populating the Number of Products nm = len(listing) # Finding Category - cat = soup.find("div", {"class": "col-9"}) - cat = cat.find("h2").text - cat = cat.replace("Category: ", "") + cat = soup.find("div", {'class': "heading"}).text cat = cat.replace(",", "") cat = cat.strip() for card in listing: category.append(cat) - bae = card.findAll('a') - # Adding the url to the list of urls - link = bae[0].get('href') + # Adding the url to the list of urls TODO: fix this + link = card.next_sibling + link.find('a').get('href') href.append(link) # Finding Product Name - product = bae[1].text + product = card.next_sibling.find('div', {'class': "title"}).find('a').text product = product.replace('\n', ' ') product = product.replace(",", "") product = product.strip() name.append(product) # Finding Vendor - vendor_name = bae[2].text + vendor_name = card.text vendor_name = vendor_name.replace(",", "") vendor_name = vendor_name.strip() vendor.append(vendor_name) # Finding USD - usd = card.find('div', {"class": "mb-1"}).text - usd = usd.replace("$", "") + usd = card.next_sibling.find('div', {"class": "buttons"}).find('div', {'class': "price"}).text + usd = usd.replace("USD", "") usd = usd.strip() USD.append(usd) + tb = card.next_sibling.find("span", {"class": "stats"}).find_all('td') + # Finding Reviews - num = card.find("span", {"class": "rate-count"}).text - num = num.replace("(", "") - num = num.replace("review)", "") - num = num.replace("reviews)", "") + num = tb[-1].text num = num.strip() reviews.append(num) + # Finding Views + view = tb[0].text.strip() + views.append(view) + + # Finding Num of Sales + sale = tb[1].text.strip() + sold.append(sale) + + # Finding shipping info + shipping = card.next_sibling.find('div', {'class': "shipping"}).text.split('>') + # SHip from + origin = shipping[0].strip() + shipFrom.append(origin) + #Ship to + destination = shipping[1].strip() + shipTo.append(destination) + + # Finding description + description = card.next_sibling.find('div', {'class': "description"}).text + description = description.replace("\n", " ") + description = description.replace("\r", " ") + description = description.replace("-", " ") + description = description.strip() + describe.append(description) + # Searching for CVE and MS categories cve = card.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: diff --git a/MarketPlaces/Initialization/geckodriver.log b/MarketPlaces/Initialization/geckodriver.log index 2ec7892..77a0a28 100644 --- a/MarketPlaces/Initialization/geckodriver.log +++ b/MarketPlaces/Initialization/geckodriver.log @@ -16982,3 +16982,74 @@ unwatchForTargets()@TargetList.jsm:37 destructor()@TargetList.jsm:109 stop()@CDP.jsm:104 close()@RemoteAgent.jsm:138 +1690471312083 geckodriver INFO Listening on 127.0.0.1:51018 +1690471316041 mozrunner::runner INFO Running command: "C:\\Users\\John Wick\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51019" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\JOHNWI~1\\AppData\\Local\\Temp\\rust_mozprofileOvVZJJ" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1690471316828 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:51019/devtools/browser/3db2bc14-2f5c-482e-9367-8dbec91f64d6 +1690471318449 Marionette INFO Listening on port 51024 +1690471318718 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\John Wick\AppData\Local\Temp\rust_mozprofileOvVZJJ\thumbnails) because it does not exist +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +1690471691471 Marionette INFO Stopped listening on port 51024 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1690471691624 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1690471761742 geckodriver INFO Listening on 127.0.0.1:51209 +1690471765842 mozrunner::runner INFO Running command: "C:\\Users\\John Wick\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "51210" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\JOHNWI~1\\AppData\\Local\\Temp\\rust_mozprofileKIHjMA" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1690471766480 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:51210/devtools/browser/463ff815-81da-4d95-b1c2-fd70ce5d9152 +1690471768298 Marionette INFO Listening on port 51215 +1690471768535 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +1690472208303 Marionette INFO Stopped listening on port 51215 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1690472208478 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138