diff --git a/MarketPlaces/Initialization/geckodriver.log b/MarketPlaces/Initialization/geckodriver.log index 8b61c4e..c45f256 100644 --- a/MarketPlaces/Initialization/geckodriver.log +++ b/MarketPlaces/Initialization/geckodriver.log @@ -15532,3 +15532,150 @@ DevTools listening on ws://localhost:51081/devtools/browser/ef699bfb-b8a4-403a-a 1689136181511 RemoteAgent WARN TLS certificate errors will be ignored for this session JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +1689305282590 geckodriver INFO Listening on 127.0.0.1:57612 +1689305286344 mozrunner::runner INFO Running command: "C:\\Users\\John Wick\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "57613" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\JOHNWI~1\\AppData\\Local\\Temp\\rust_mozprofileW1wjHz" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689305287006 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:57613/devtools/browser/9cb995f7-f1d0-45e1-a9ae-0903f91679e2 +1689305288403 Marionette INFO Listening on port 57618 +1689305288510 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689305558621 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'. +1689305591430 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'. +1689305927779 Marionette INFO Stopped listening on port 57618 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +1689305927959 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689306152997 geckodriver INFO Listening on 127.0.0.1:62728 +1689306156730 mozrunner::runner INFO Running command: "C:\\Users\\John Wick\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "62729" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\JOHNWI~1\\AppData\\Local\\Temp\\rust_mozprofileQfWfpc" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689306157335 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:62729/devtools/browser/90212f30-1413-403a-a4d6-85a9ad71de86 +1689306158784 Marionette INFO Listening on port 62734 +1689306158827 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689306327168 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'. +1689306352097 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'. +1689306672567 Marionette INFO Stopped listening on port 62734 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function + +###!!! [Parent][MessageChannel] Error: (msgtype=0x390076,name=PContent::Msg_DestroyBrowsingContextGroup) Closed channel: cannot send/recv + + +###!!! [Child][MessageChannel] Error: (msgtype=0x23002E,name=PBrowser::Msg___delete__) Channel closing: too late to send/recv, messages will be lost + + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1689306672742 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689353148352 geckodriver INFO Listening on 127.0.0.1:57720 +1689353152386 mozrunner::runner INFO Running command: "C:\\Users\\John Wick\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "57721" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\JOHNWI~1\\AppData\\Local\\Temp\\rust_mozprofilebdVBHT" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689353153078 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:57721/devtools/browser/66aa6550-8450-49a2-be19-7728fc52cb65 +1689353154754 Marionette INFO Listening on port 57726 +1689353155234 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\John Wick\AppData\Local\Temp\rust_mozprofilebdVBHT\thumbnails) because it does not exist +1689353351388 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'. +1689353375169 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'. +1689353609409 Marionette INFO Stopped listening on port 57726 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +1689353609555 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689358893192 geckodriver INFO Listening on 127.0.0.1:53304 +1689358897088 mozrunner::runner INFO Running command: "C:\\Users\\John Wick\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "53305" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\JOHNWI~1\\AppData\\Local\\Temp\\rust_mozprofile5c9ZQ4" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689358897866 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:53305/devtools/browser/2f57a39e-c4c4-4c89-af0b-cc8d26d8a863 +1689358899540 Marionette INFO Listening on port 53310 +1689358899767 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\John Wick\AppData\Local\Temp\rust_mozprofile5c9ZQ4\thumbnails) because it does not exist +1689359085260 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'. +1689359112369 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +1689360786237 Marionette INFO Stopped listening on port 53310 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function + +###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1689360786406 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt index 19c6d10..a427446 100644 --- a/MarketPlaces/Initialization/marketsList.txt +++ b/MarketPlaces/Initialization/marketsList.txt @@ -1 +1 @@ -M00nkeyMarket +ViceCity diff --git a/MarketPlaces/ViceCity/crawler_selenium.py b/MarketPlaces/ViceCity/crawler_selenium.py index 0b22082..584a90c 100644 --- a/MarketPlaces/ViceCity/crawler_selenium.py +++ b/MarketPlaces/ViceCity/crawler_selenium.py @@ -46,7 +46,7 @@ def startCrawling(): # print(driver.current_url, e) # closetor(driver) - new_parse(mktName, baseURL, True) + new_parse(mktName, baseURL, False) # Opens Tor Browser @@ -189,15 +189,14 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\ViceCity\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\ViceCity\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath @@ -266,10 +265,11 @@ def crawlForum(driver): driver.refresh() time.sleep(2.5) # to let page catchup savePage(driver.page_source, item) + time.sleep(2.5) # so site doesnt crash driver.back() #comment out - break + # break # # comment out # if count == 1: diff --git a/MarketPlaces/ViceCity/parser.py b/MarketPlaces/ViceCity/parser.py index 65d6b8f..eeb97a3 100644 --- a/MarketPlaces/ViceCity/parser.py +++ b/MarketPlaces/ViceCity/parser.py @@ -1,5 +1,7 @@ __author__ = 'DarkWeb' +import re + # Here, we are importing the auxiliary functions to clean or convert data from MarketPlaces.Utilities.utilities import * @@ -39,11 +41,16 @@ def vicecity_description_parser(soup): name = name.strip() # Finding Vendor - vendor = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}).find('a').text.strip() + vendor = soup.find('div', {'class': "listing_info"}) + vendor = vendor.find('div', {'class': "listing_right"}) + numbers = vendor.find('a').find('span').text + vendor = vendor.find('a').text + vendor = vendor.replace(numbers, "").strip() # removes numbers at the end of vendor name # Finding Vendor Rating rating = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}).find('a').get('title') - rating = str(re.match(r"\d+%", rating)).strip() + rating = re.search(r"\d+%", rating) + rating_vendor = rating.group(0).strip() # Finding Quantity Sold and Left # temp = mb[4].text.split(',') @@ -56,28 +63,31 @@ def vicecity_description_parser(soup): # Finding Successful Transactions success = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}).find('a').get('title') - success = str(re.compile(r"\d+(?= sales)", success)).strip() + success = re.search(r"\d+(?= sales)", success) + success = success.group(0).strip() + bae = soup.find('pre') # Finding USD USD = bae.find('span').text - USD = str(re.compile(r"\$\d+(?:\.\d+)?", USD)) + USD = re.search(r"\$\d+(?:\.\d+)?", USD).group(0) USD = USD.replace("$", "").strip() # Finding BTC - BTC = bae.findall('span') - BTC = str(re.compile(r"\d+(?:\.\d+)?", BTC[1].text)).strip() + BTC = bae.find_all('span') + BTC = re.search(r"\d+(?:\.\d+)?", BTC[1].text).group(0) + BTC = BTC.strip() # Finding the Product Category category = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}) category = category.find('span', {'style': "font-size:15px;color: #a1a1a1"}).text category = category.replace("Category:", "").strip() - li = bae.find('span', {'style': "float:right"}).find_all('span') + li = bae.find_all('span') # Finding Shipment Information (Origin) - shipFrom = li[1].text.strip() + shipFrom = li[-4].text.strip() # Finding Shipment Information (Destination) shipTo = li[-2].text.strip() @@ -91,7 +101,11 @@ def vicecity_description_parser(soup): # Finding the Number of Product Reviews li = soup.find_all('label', {'class': "tc_label threetabs"}) review = li[1].text - review = str(re.compile(r"\d+", review)).strip() + review = re.search(r"\d+", review) + if review: + reviews = review.group(0).strip() + else: + reviews = '0' # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -145,7 +159,7 @@ def vicecity_listing_parser(soup): shipTo = [] # 19 Product_ShippedTo href = [] # 20 Product_Links - listing = soup.find('div', {"class": "frontpage"}).findAll('div', {"class": "wLf"}) + listing = soup.findAll('div', {"class": "wLf"}) # should be 30 # Populating the Number of Products nm = len(listing) @@ -177,7 +191,9 @@ def vicecity_listing_parser(soup): # Finding the Vendor vendor_name = a.find('div', {"class": "wLfVendor"}).find('a').text + addedNums = a.find('div', {"class": "wLfVendor"}).find('a').find('span').text # finds numbers added at end vendor_name = vendor_name.replace(",", "") + vendor_name = vendor_name.replace(addedNums, "") # removes numbers added at end vendor_name = vendor_name.strip() vendor.append(vendor_name) @@ -185,11 +201,12 @@ def vicecity_listing_parser(soup): price = a.find('div', {"class": "wLfPrice"}).find_all('span') ud = price[0].text.replace(" USD", " ") # u = ud.replace("$","") - u = ud.replace(",", "") + ud = ud.replace(",", "") + u = ud.replace(price[1].text, "") u = u.strip() USD.append(u) bc = price[1].text - bc = str(re.compile(r"\d+(?:\.\d+)?", bc)) + bc = re.search(r"\d+(?:\.\d+)?", bc).group(0).strip() BTC.append(bc) # # Finding Reviews @@ -202,10 +219,23 @@ def vicecity_listing_parser(soup): # Finding Successful Transactions freq = a.find('div', {"class": "wLfVendor"}).find('a').get('title') - freq = re.compile(r'\d+(?= sales)', freq) + freq = re.search(r'\d+(?= sales)', freq).group(0) freq = freq.strip() success.append(freq) + # Finding Ship from and ship to + place = a.find('div', {"class": "wLfPrice"}) + place = place.find('span', {'style': "font-size: 12px;"}).text + place = place.split('⟶') + varFrom = place[0].strip() + varTo = place[1].strip() + if varFrom == "WW": + varFrom = "Worldwide" + if varTo == "WW": + varTo = "Worldwide" + shipFrom.append(varFrom) + shipTo.append(varTo) + # Searching for CVE and MS categories cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}')) if not cve: @@ -242,7 +272,7 @@ def vicecity_links_parser(soup): # Returning all links that should be visited by the Crawler href = [] - listing = soup.find('div', {"class": "frontpage"}).findAll('div', {"class": "wLf"}) + listing = soup.findAll('div', {"class": "wLf"}) for a in listing: bae = a.find('div', {"class": "wLfLeft"}).find('a', href=True) diff --git a/setup.ini b/setup.ini index 641d3f1..c87990a 100644 --- a/setup.ini +++ b/setup.ini @@ -1,15 +1,14 @@ - [TOR] -firefox_binary_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe -firefox_profile_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Data\\Browser\\profile.default -geckodriver_path = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test\\selenium\\geckodriver.exe +firefox_binary_path = C:\Users\John Wick\Desktop\Tor Browser\Browser\firefox.exe +firefox_profile_path = C:\Users\John Wick\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default +geckodriver_path = C:\Users\John Wick\PycharmProjects\dw_pipeline_test\selenium\geckodriver.exe [Project] -project_directory = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test -shared_folder = \\VBoxSvr\\Shared +project_directory = C:\Users\John Wick\PycharmProjects\dw_pipeline_test +shared_folder = Z:\\VBoxSvr\\VM_Files_ (shared) [PostgreSQL] ip = localhost username = postgres -password = password +password = postgres database = darkweb_markets_forums \ No newline at end of file