Browse Source

Fixed bugs where the crawler did not get all pages and the parser broke

main
Joshua 1 year ago
parent
commit
abeb2c0df0
5 changed files with 206 additions and 30 deletions
  1. +147
    -0
      MarketPlaces/Initialization/geckodriver.log
  2. +1
    -1
      MarketPlaces/Initialization/marketsList.txt
  3. +8
    -8
      MarketPlaces/ViceCity/crawler_selenium.py
  4. +44
    -14
      MarketPlaces/ViceCity/parser.py
  5. +6
    -7
      setup.ini

+ 147
- 0
MarketPlaces/Initialization/geckodriver.log View File

@ -15532,3 +15532,150 @@ DevTools listening on ws://localhost:51081/devtools/browser/ef699bfb-b8a4-403a-a
1689136181511 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
1689305282590 geckodriver INFO Listening on 127.0.0.1:57612
1689305286344 mozrunner::runner INFO Running command: "C:\\Users\\John Wick\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "57613" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\JOHNWI~1\\AppData\\Local\\Temp\\rust_mozprofileW1wjHz"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689305287006 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:57613/devtools/browser/9cb995f7-f1d0-45e1-a9ae-0903f91679e2
1689305288403 Marionette INFO Listening on port 57618
1689305288510 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689305558621 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'.
1689305591430 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'.
1689305927779 Marionette INFO Stopped listening on port 57618
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
1689305927959 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689306152997 geckodriver INFO Listening on 127.0.0.1:62728
1689306156730 mozrunner::runner INFO Running command: "C:\\Users\\John Wick\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "62729" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\JOHNWI~1\\AppData\\Local\\Temp\\rust_mozprofileQfWfpc"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689306157335 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:62729/devtools/browser/90212f30-1413-403a-a4d6-85a9ad71de86
1689306158784 Marionette INFO Listening on port 62734
1689306158827 RemoteAgent WARN TLS certificate errors will be ignored for this session
1689306327168 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'.
1689306352097 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'.
1689306672567 Marionette INFO Stopped listening on port 62734
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
###!!! [Parent][MessageChannel] Error: (msgtype=0x390076,name=PContent::Msg_DestroyBrowsingContextGroup) Closed channel: cannot send/recv
###!!! [Child][MessageChannel] Error: (msgtype=0x23002E,name=PBrowser::Msg___delete__) Channel closing: too late to send/recv, messages will be lost
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689306672742 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689353148352 geckodriver INFO Listening on 127.0.0.1:57720
1689353152386 mozrunner::runner INFO Running command: "C:\\Users\\John Wick\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "57721" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\JOHNWI~1\\AppData\\Local\\Temp\\rust_mozprofilebdVBHT"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689353153078 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:57721/devtools/browser/66aa6550-8450-49a2-be19-7728fc52cb65
1689353154754 Marionette INFO Listening on port 57726
1689353155234 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\John Wick\AppData\Local\Temp\rust_mozprofilebdVBHT\thumbnails) because it does not exist
1689353351388 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'.
1689353375169 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'.
1689353609409 Marionette INFO Stopped listening on port 57726
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
1689353609555 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138
1689358893192 geckodriver INFO Listening on 127.0.0.1:53304
1689358897088 mozrunner::runner INFO Running command: "C:\\Users\\John Wick\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "53305" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\JOHNWI~1\\AppData\\Local\\Temp\\rust_mozprofile5c9ZQ4"
console.log: "TorSettings: loadFromPrefs()"
console.log: "TorConnect: init()"
console.log: "TorConnect: Entering Initial state"
console.log: "TorConnect: Observed profile-after-change"
console.log: "TorConnect: Observing topic 'TorProcessExited'"
console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'"
console.log: "TorConnect: Observing topic 'torsettings:ready'"
console.log: "TorSettings: Observed profile-after-change"
1689358897866 Marionette INFO Marionette enabled
console.log: "TorConnect: Will load after bootstrap => [about:blank]"
console.error: "Could not load engine [email protected]: Error: Extension is invalid"
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined
DevTools listening on ws://localhost:53305/devtools/browser/2f57a39e-c4c4-4c89-af0b-cc8d26d8a863
1689358899540 Marionette INFO Listening on port 53310
1689358899767 RemoteAgent WARN TLS certificate errors will be ignored for this session
JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\John Wick\AppData\Local\Temp\rust_mozprofile5c9ZQ4\thumbnails) because it does not exist
1689359085260 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'.
1689359112369 Marionette WARN Ignoring event 'DOMContentLoaded' because document has an invalid readyState of 'complete'.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory.
1689360786237 Marionette INFO Stopped listening on port 53310
JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver]
!!! error running onStopped callback: TypeError: callback is not a function
###!!! [Child][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost
1689360786406 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64
unwatchForTabs()@TargetList.jsm:70
unwatchForTargets()@TargetList.jsm:37
destructor()@TargetList.jsm:109
stop()@CDP.jsm:104
close()@RemoteAgent.jsm:138

+ 1
- 1
MarketPlaces/Initialization/marketsList.txt View File

@ -1 +1 @@
M00nkeyMarket
ViceCity

+ 8
- 8
MarketPlaces/ViceCity/crawler_selenium.py View File

@ -46,7 +46,7 @@ def startCrawling():
# print(driver.current_url, e)
# closetor(driver)
new_parse(mktName, baseURL, True)
new_parse(mktName, baseURL, False)
# Opens Tor Browser
@ -189,15 +189,14 @@ def savePage(page, url):
# Gets the full path of the page to be saved along with its appropriate file name
#@param: raw url as crawler crawls through every site
def getFullPathName(url):
from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
fileName = getNameFromURL(url)
if isDescriptionLink(url):
fullPath = r'..\ViceCity\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
else:
fullPath = r'..\ViceCity\HTML_Pages\\' + str(
"%02d" % date.today().month) + str("%02d" % date.today().day) + str(
"%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html'
fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
return fullPath
@ -266,10 +265,11 @@ def crawlForum(driver):
driver.refresh()
time.sleep(2.5) # to let page catch up
savePage(driver.page_source, item)
time.sleep(2.5) # so site doesn't crash
driver.back()
#comment out
break
# break
# # comment out
# if count == 1:


+ 44
- 14
MarketPlaces/ViceCity/parser.py View File

@ -1,5 +1,7 @@
__author__ = 'DarkWeb'
import re
# Here, we are importing the auxiliary functions to clean or convert data
from MarketPlaces.Utilities.utilities import *
@ -39,11 +41,16 @@ def vicecity_description_parser(soup):
name = name.strip()
# Finding Vendor
vendor = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}).find('a').text.strip()
vendor = soup.find('div', {'class': "listing_info"})
vendor = vendor.find('div', {'class': "listing_right"})
numbers = vendor.find('a').find('span').text
vendor = vendor.find('a').text
vendor = vendor.replace(numbers, "").strip() # removes numbers at the end of vendor name
# Finding Vendor Rating
rating = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}).find('a').get('title')
rating = str(re.match(r"\d+%", rating)).strip()
rating = re.search(r"\d+%", rating)
rating_vendor = rating.group(0).strip()
# Finding Quantity Sold and Left
# temp = mb[4].text.split(',')
@ -56,28 +63,31 @@ def vicecity_description_parser(soup):
# Finding Successful Transactions
success = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"}).find('a').get('title')
success = str(re.compile(r"\d+(?= sales)", success)).strip()
success = re.search(r"\d+(?= sales)", success)
success = success.group(0).strip()
bae = soup.find('pre')
# Finding USD
USD = bae.find('span').text
USD = str(re.compile(r"\$\d+(?:\.\d+)?", USD))
USD = re.search(r"\$\d+(?:\.\d+)?", USD).group(0)
USD = USD.replace("$", "").strip()
# Finding BTC
BTC = bae.findall('span')
BTC = str(re.compile(r"\d+(?:\.\d+)?", BTC[1].text)).strip()
BTC = bae.find_all('span')
BTC = re.search(r"\d+(?:\.\d+)?", BTC[1].text).group(0)
BTC = BTC.strip()
# Finding the Product Category
category = soup.find('div', {'class': "listing_info"}).find('div', {'class': "listing_right"})
category = category.find('span', {'style': "font-size:15px;color: #a1a1a1"}).text
category = category.replace("Category:", "").strip()
li = bae.find('span', {'style': "float:right"}).find_all('span')
li = bae.find_all('span')
# Finding Shipment Information (Origin)
shipFrom = li[1].text.strip()
shipFrom = li[-4].text.strip()
# Finding Shipment Information (Destination)
shipTo = li[-2].text.strip()
@ -91,7 +101,11 @@ def vicecity_description_parser(soup):
# Finding the Number of Product Reviews
li = soup.find_all('label', {'class': "tc_label threetabs"})
review = li[1].text
review = str(re.compile(r"\d+", review)).strip()
review = re.search(r"\d+", review)
if review:
reviews = review.group(0).strip()
else:
reviews = '0'
# Searching for CVE and MS categories
cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
@ -145,7 +159,7 @@ def vicecity_listing_parser(soup):
shipTo = [] # 19 Product_ShippedTo
href = [] # 20 Product_Links
listing = soup.find('div', {"class": "frontpage"}).findAll('div', {"class": "wLf"})
listing = soup.findAll('div', {"class": "wLf"}) # should be 30
# Populating the Number of Products
nm = len(listing)
@ -177,7 +191,9 @@ def vicecity_listing_parser(soup):
# Finding the Vendor
vendor_name = a.find('div', {"class": "wLfVendor"}).find('a').text
addedNums = a.find('div', {"class": "wLfVendor"}).find('a').find('span').text # finds numbers added at end
vendor_name = vendor_name.replace(",", "")
vendor_name = vendor_name.replace(addedNums, "") # removes numbers added at end
vendor_name = vendor_name.strip()
vendor.append(vendor_name)
@ -185,11 +201,12 @@ def vicecity_listing_parser(soup):
price = a.find('div', {"class": "wLfPrice"}).find_all('span')
ud = price[0].text.replace(" USD", " ")
# u = ud.replace("$","")
u = ud.replace(",", "")
ud = ud.replace(",", "")
u = ud.replace(price[1].text, "")
u = u.strip()
USD.append(u)
bc = price[1].text
bc = str(re.compile(r"\d+(?:\.\d+)?", bc))
bc = re.search(r"\d+(?:\.\d+)?", bc).group(0).strip()
BTC.append(bc)
# # Finding Reviews
@ -202,10 +219,23 @@ def vicecity_listing_parser(soup):
# Finding Successful Transactions
freq = a.find('div', {"class": "wLfVendor"}).find('a').get('title')
freq = re.compile(r'\d+(?= sales)', freq)
freq = re.search(r'\d+(?= sales)', freq).group(0)
freq = freq.strip()
success.append(freq)
# Finding Ship from and ship to
place = a.find('div', {"class": "wLfPrice"})
place = place.find('span', {'style': "font-size: 12px;"}).text
place = place.split('')
varFrom = place[0].strip()
varTo = place[1].strip()
if varFrom == "WW":
varFrom = "Worldwide"
if varTo == "WW":
varTo = "Worldwide"
shipFrom.append(varFrom)
shipTo.append(varTo)
# Searching for CVE and MS categories
cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
if not cve:
@ -242,7 +272,7 @@ def vicecity_links_parser(soup):
# Returning all links that should be visited by the Crawler
href = []
listing = soup.find('div', {"class": "frontpage"}).findAll('div', {"class": "wLf"})
listing = soup.findAll('div', {"class": "wLf"})
for a in listing:
bae = a.find('div', {"class": "wLfLeft"}).find('a', href=True)


+ 6
- 7
setup.ini View File

@ -1,15 +1,14 @@
[TOR]
firefox_binary_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe
firefox_profile_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Data\\Browser\\profile.default
geckodriver_path = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test\\selenium\\geckodriver.exe
firefox_binary_path = C:\Users\John Wick\Desktop\Tor Browser\Browser\firefox.exe
firefox_profile_path = C:\Users\John Wick\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
geckodriver_path = C:\Users\John Wick\PycharmProjects\dw_pipeline_test\selenium\geckodriver.exe
[Project]
project_directory = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test
shared_folder = \\VBoxSvr\\Shared
project_directory = C:\Users\John Wick\PycharmProjects\dw_pipeline_test
shared_folder = Z:\\VBoxSvr\\VM_Files_ (shared)
[PostgreSQL]
ip = localhost
username = postgres
password = password
password = postgres
database = darkweb_markets_forums

Loading…
Cancel
Save