
completed lionmarketplace scraper

main
Helium, 1 year ago
commit bcf6ed0b7b
5 changed files with 257 additions and 265 deletions
  1. MarketPlaces/Initialization/marketsList.txt (+1 / -1)
  2. MarketPlaces/Initialization/prepare_parser.py (+5 / -0)
  3. MarketPlaces/LionMarketplace/crawler_selenium.py (+14 / -22)
  4. MarketPlaces/LionMarketplace/parser.py (+233 / -238)
  5. setup.ini (+4 / -4)

MarketPlaces/Initialization/marketsList.txt (+1 / -1)

@@ -1 +1 @@
-DigitalThriftShop
+LionMarketplace
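
Note: marketsList.txt selects which scraper the Initialization step runs; this commit switches it from DigitalThriftShop to LionMarketplace. A minimal sketch of how a one-name-per-line list like this is plausibly consumed (the read loop and print stand-in are assumptions, not code from this repo):

    # hedged sketch: read one marketplace name per line and dispatch on it
    with open('MarketPlaces/Initialization/marketsList.txt') as f:
        markets = [line.strip() for line in f if line.strip()]

    for mkt in markets:                      # after this commit: ['LionMarketplace']
        print('would crawl/parse:', mkt)     # stand-in for the real dispatch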

MarketPlaces/Initialization/prepare_parser.py (+5 / -0)

@@ -15,6 +15,7 @@ from MarketPlaces.TorBay.parser import *
 from MarketPlaces.M00nkeyMarket.parser import *
 from MarketPlaces.DarkMatter.parser import *
 from MarketPlaces.DigitalThriftShop.parser import *
+from MarketPlaces.LionMarketplace.parser import *
 from MarketPlaces.Classifier.classify_product import predict
@@ -163,6 +164,8 @@ def new_parse(marketPlace, url, createLog):
             rmm = darkmatter_description_parser(soup)
         elif marketPlace == "DigitalThriftShop":
             rmm = digitalThriftShop_description_parser(soup)
+        elif marketPlace == "LionMarketplace":
+            rmm = lionmarketplace_description_parser(soup)

         # key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2]
         key = u"Url:" + os.path.basename(line2).replace(".html", "")
@@ -227,6 +230,8 @@ def new_parse(marketPlace, url, createLog):
             rw = darkmatter_listing_parser(soup)
         elif marketPlace == "DigitalThriftShop":
             rw = digitalThriftShop_listing_parser(soup)
+        elif marketPlace == "LionMarketplace":
+            rw = lionmarketplace_listing_parser(soup)
         else:
             parseError = True
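
The two new elif arms extend new_parse's name-keyed dispatch: each supported marketplace maps to a description parser and a listing parser, both taking a BeautifulSoup object and returning rows in a shared field layout. A minimal sketch of the same dispatch as a lookup table (the function names are from the diff; the dict form and parse_description wrapper are illustrations, not the repo's code):

    from bs4 import BeautifulSoup
    from MarketPlaces.DigitalThriftShop.parser import digitalThriftShop_description_parser
    from MarketPlaces.LionMarketplace.parser import lionmarketplace_description_parser

    DESCRIPTION_PARSERS = {
        "DigitalThriftShop": digitalThriftShop_description_parser,
        "LionMarketplace": lionmarketplace_description_parser,
    }

    def parse_description(marketPlace, html):
        soup = BeautifulSoup(html, "html.parser")
        if marketPlace not in DESCRIPTION_PARSERS:
            # mirrors the else: parseError = True branch above
            raise ValueError("no parser registered for " + marketPlace)
        return DESCRIPTION_PARSERS[marketPlace](soup)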


MarketPlaces/LionMarketplace/crawler_selenium.py (+14 / -22)

@@ -32,7 +32,7 @@ baseURL = 'http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion

 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
     opentor()
-    # mktName = getMKTName()
+    mktName = getMKTName()
     driver = getAccess()

     if driver != 'down':

@@ -43,7 +43,7 @@ def startCrawling():
         print(driver.current_url, e)
     closetor(driver)

-    # new_parse(forumName, baseURL, False)
+    new_parse(mktName, baseURL, False)


 # Opens Tor Browser
@@ -187,20 +187,12 @@ def getInterestedLinks():
     # Software/Malware
     links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/16')
-    # # Carding
-    # links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/20')
-    # # Hacker for hire
-    # links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/0b19f3a0-c7e8-11ec-997b-0dcb6b05ce1d')
-    # # Phishing
-    # links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/18098bb0-c7e8-11ec-95e9-45b5e8898cbd')
-    # # Ransomware
-    # links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/ce72cee0-c7e7-11ec-a86b-c1ff2d3b2020')
-    # # Exploits
-    # links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/e26387c0-c7e7-11ec-a708-ab6dc5117763')
-    # # Spamming and Anti-Captcha
-    # links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/f08a9380-c7e7-11ec-918c-ffef7c670c97')
-    # hacked accounts
-    #links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/fd47b4a0-c7e7-11ec-937b-61246c4b12b3')
+    # Carding
+    links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/20')
+    # Hacking
+    links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/ba142ac0-c7e7-11ec-9bd1-fdd89c3d3f91')
+    # tutorial
+    links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/19')

     return links
@@ -239,12 +231,12 @@ def crawlForum(driver):
                         savePage(driver.page_source, item)
                         driver.back()

-                        # comment out
-                        break
+                        # # comment out
+                        # break
+                        #
-                    # comment out
-                    if count == 1:
-                        break
+                    # # comment out
+                    # if count == 1:
+                    #     break

                 try:
                     link = driver.find_element(by=By.XPATH, value=
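
With both break statements commented out, crawlForum no longer stops after the first product and first listing page: it saves every product page in each interested category, then follows the pagination link. A minimal sketch of that loop shape (savePage and the By.XPATH lookup appear in the diff; the get_product_urls helper and both XPaths are hypothetical, since the real XPath is truncated above):

    from selenium.webdriver.common.by import By

    def get_product_urls(driver):
        # hypothetical helper: collect product links on the current listing page
        return [a.get_attribute('href')
                for a in driver.find_elements(by=By.XPATH, value='//div[@class="card-body"]//a')]

    def crawl_category(driver, link):
        page = link
        while page:
            driver.get(page)
            for item in get_product_urls(driver):
                driver.get(item)
                savePage(driver.page_source, item)   # savePage comes from crawler_selenium.py
            driver.get(page)                         # return to the listing before paging on
            try:
                page = driver.find_element(by=By.XPATH,
                                           value='//a[@rel="next"]').get_attribute('href')  # hypothetical XPath
            except Exception:
                page = None                          # no next page: category exhausted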


MarketPlaces/LionMarketplace/parser.py (+233 / -238)

@@ -11,133 +11,144 @@ from bs4 import BeautifulSoup

 #stores info it needs in different lists, these lists are returned after being organized
 #@param: soup object looking at html page of description page
 #return: 'row' that contains a variety of lists that each hold info on the description page
-def darkfox_description_parser(soup):
+def lionmarketplace_description_parser(soup):

     # Fields to be parsed
-    name = "-1"       # 0 Product_Name
-    describe = "-1"   # 1 Product_Description
-    lastSeen = "-1"   # 2 Product_LastViewDate
-    rules = "-1"      # 3 NOT USED ...
-    CVE = "-1"        # 4 Product_CVE_Classification (Common Vulnerabilities and Exposures)
-    MS = "-1"         # 5 Product_MS_Classification (Microsoft Security)
-    review = "-1"     # 6 Product_Number_Of_Reviews
-    category = "-1"   # 7 Product_Category
-    shipFrom = "-1"   # 8 Product_ShippedFrom
-    shipTo = "-1"     # 9 Product_ShippedTo
-    left = "-1"       # 10 Product_QuantityLeft
-    escrow = "-1"     # 11 Vendor_Warranty
-    terms = "-1"      # 12 Vendor_TermsAndConditions
-    vendor = "-1"     # 13 Vendor_Name
-    sold = "-1"       # 14 Product_QuantitySold
-    addDate = "-1"    # 15 Product_AddedDate
-    available = "-1"  # 16 NOT USED ...
-    endDate = "-1"    # 17 NOT USED ...
-    BTC = "-1"        # 18 Product_BTC_SellingPrice
-    USD = "-1"        # 19 Product_USD_SellingPrice
-    rating = "-1"     # 20 Vendor_Rating
-    success = "-1"    # 21 Vendor_Successful_Transactions
-    EURO = "-1"       # 22 Product_EURO_SellingPrice
-
-    # Finding Product Name
-    name = soup.find('h1').text
-    name = name.replace('\n', ' ')
-    name = name.replace(",", "")
-    name = name.strip()
-
-    # Finding Vendor
-    vendor = soup.find('h3').find('a').text.strip()
-
-    # Finding Vendor Rating
-    rating = soup.find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding Successful Transactions
-    success = soup.find('h3').text
-    success = success.replace("Vendor: ", "")
-    success = success.replace(vendor, "")
-    success = success.replace("(", "")
-    success = success.replace(")", "")
-    success = success.strip()
-
-    bae = soup.find('div', {'class': "box"}).find_all('ul')
-
-    # Finding Prices
-    USD = bae[1].find('strong').text.strip()
-
-    li = bae[2].find_all('li')
-
-    # Finding Escrow
-    escrow = li[0].find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding the Product Category
-    category = li[1].find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding the Product Quantity Available
-    left = li[3].find('span', {'class': "tag is-dark"}).text.strip()
-
-    # Finding Number Sold
-    sold = li[4].find('span', {'class': "tag is-dark"}).text.strip()
-
-    li = bae[3].find_all('li')
-
-    # Finding Shipment Information (Origin)
-    if "Ships from:" in li[-2].text:
-        shipFrom = li[-2].text
-        shipFrom = shipFrom.replace("Ships from: ", "")
-        # shipFrom = shipFrom.replace(",", "")
-        shipFrom = shipFrom.strip()
-
-    # Finding Shipment Information (Destination)
-    shipTo = li[-1].find('div', {'title': "List of countries is scrollable"}).text
-    shipTo = shipTo.replace("Ships to: ", "")
-    shipTo = shipTo.strip()
-    if "certain countries" in shipTo:
-        countries = ""
-        tags = li[-1].find_all('span', {'class': "tag"})
-        for tag in tags:
-            country = tag.text.strip()
-            countries += country + ", "
-        shipTo = countries.strip(", ")
-
-    # Finding the Product description
-    describe = soup.find('div', {'class': "pre-line"}).text
-    describe = describe.replace("\n", " ")
-    describe = describe.strip()
-
-    '''# Finding the Number of Product Reviews
-    tag = soup.findAll(text=re.compile('Reviews'))
-    for index in tag:
-        reviews = index
-        par = reviews.find('(')
-        if par >= 0:
-            reviews = reviews.replace("Reviews (", "")
-            reviews = reviews.replace(")", "")
-            reviews = reviews.split(",")
-            review = str(abs(int(reviews[0])) + abs(int(reviews[1])))
-        else:
-            review = "-1"'''
-
-    # Searching for CVE and MS categories
-    cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-    if cve:
-        CVE = " "
-        for idx in cve:
-            CVE += (idx)
-            CVE += " "
-            CVE = CVE.replace(',', ' ')
-            CVE = CVE.replace('\n', '')
-    ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
-    if ms:
-        MS = " "
-        for im in ms:
-            MS += (im)
-            MS += " "
-            MS = MS.replace(',', ' ')
-            MS = MS.replace('\n', '')
+    vendor = "-1"         # 0 *Vendor_Name
+    success = "-1"        # 1 Vendor_Successful_Transactions
+    rating_vendor = "-1"  # 2 Vendor_Rating
+    name = "-1"           # 3 *Product_Name
+    describe = "-1"       # 4 Product_Description
+    CVE = "-1"            # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
+    MS = "-1"             # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
+    category = "-1"       # 7 Product_Category
+    views = "-1"          # 8 Product_Number_Of_Views
+    reviews = "-1"        # 9 Product_Number_Of_Reviews
+    rating_item = "-1"    # 10 Product_Rating
+    addDate = "-1"        # 11 Product_AddedDate
+    BTC = "-1"            # 12 Product_BTC_SellingPrice
+    USD = "-1"            # 13 Product_USD_SellingPrice
+    EURO = "-1"           # 14 Product_EURO_SellingPrice
+    sold = "-1"           # 15 Product_QuantitySold
+    left = "-1"           # 16 Product_QuantityLeft
+    shipFrom = "-1"       # 17 Product_ShippedFrom
+    shipTo = "-1"         # 18 Product_ShippedTo
+
+    # vendor name
+    try:
+        temp = soup.find('div', {'class': 'btn-group'}).find('a').text
+        vendor = (cleanString(temp.strip()))
+    except:
+        print('vendor')
+        vendor = "-1"
+
+    # table with info
+    table = soup.find('table', {'class': 'table border-0 text-left table-borderless'})
+    rows = table.findAll('tr')
+
+    # successful transaction
+    success = "-1"
+
+    # vendor rating 5
+    rating_vendor = '-1'
+
+    # product name
+    try:
+        temp = soup.find('div', {'class': 'row'}).find('h2').text
+        name = (cleanString(temp.strip()))
+    except:
+        name = '-1'
+        print('product name')
+
+    # product description
+    try:
+        temp = soup.find('div', {'class': "mt-4"}).findAll('p')
+        temp = temp[1].text
+        if "\n" in temp:
+            temp = temp.replace("\n", " ")
+            temp = temp.replace("\r", " ")
+        describe = cleanString(temp.strip())
+    except:
+        describe = "-1"
+        print('describe')
+
+    CVE = "-1"  # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much
+    MS = "-1"   # 6 Product_MS_Classification (Microsoft Security) dont worry about that much
+
+    # product category
+    try:
+        temp = rows[1].find('strong').text
+        category = cleanString(temp.strip())
+    except:
+        category = "-1"
+        print('category')
+
+    # product number of views
+    views = "-1"
+    reviews = "-1"      # 9 Product_Number_Of_Reviews
+    rating_item = "-1"  # 10 Product_Rating
+    addDate = "-1"      # 11 Product_AddedDate
+
+    # BTC selling price box box-rounded mt-2
+    BTC = "-1"
+
+    # USD selling price
+    try:
+        temp = rows[2].find('strong').text
+        if " $" in temp:
+            temp = temp.replace(" $", "")
+        elif "$" in temp:
+            temp = temp.replace("$", "")
+        USD = cleanString((temp.strip()))
+    except:
+        try:
+            temp = soup.find('li').find('strong').text
+            if " $" in temp:
+                temp = temp.replace(" $", "")
+            elif "$" in temp:
+                temp = temp.replace("$", "")
+            USD = cleanString((temp.strip()))
+        except:
+            print("USD")
+
+    EURO = "-1"  # 14 Product_EURO_SellingPrice
+
+    # product sold
+    try:
+        if (len(rows) <= 5):
+            temp = rows[4].find('td').text
+            string = cleanString(temp)
+            if (string == 'Left/Sold'):
+                temp = rows[4].findAll('td')
+                temp = temp[1].findAll('span')
+                # left
+                temp2 = temp[1].text
+                temp3 = temp[1].text
+                if (" items" in temp2):
+                    temp2 = temp2.replace(" items", "")
+                if (" items" in temp3):
+                    temp3 = temp3.replace(" items", "")
+                sold = (cleanString(temp2.strip()))
+                left = cleanString(temp3.strip())
+            else:
+                sold = '-1'
+                left = "-1"
+        else:
+            sold = '-1'
+            left = "-1"
+    except:
+        print("success")
+        sold = '-1'
+        left = "-1"
+
+    shipFrom = "-1"  # 17 Product_ShippedFrom
+    shipTo = "-1"    # 18 Product_ShippedTo

     # Populating the final variable (this should be a list with all fields scraped)
-    row = (name, describe, lastSeen, rules, CVE, MS, review, category, shipFrom, shipTo, left, escrow, terms, vendor,
-           sold, addDate, available, endDate, BTC, USD, rating, success, EURO)
+    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+           BTC, USD, EURO, sold, left, shipFrom, shipTo)

     # Sending the results
     return row
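
The rewritten description parser always returns a 19-field tuple in the order fixed by the numbered comments above (vendor at index 0 through shipTo at index 18), with "-1" for anything the page does not expose. A minimal sketch of consuming that row from a saved description page (the file name and the DESC_FIELDS tuple are illustrative, not part of the repo):

    from bs4 import BeautifulSoup
    from MarketPlaces.LionMarketplace.parser import lionmarketplace_description_parser

    DESC_FIELDS = ("vendor", "rating_vendor", "success", "name", "describe", "CVE", "MS",
                   "category", "views", "reviews", "rating_item", "addDate", "BTC", "USD",
                   "EURO", "sold", "left", "shipFrom", "shipTo")

    with open("description_page.html", encoding="utf8") as f:   # hypothetical saved capture
        soup = BeautifulSoup(f.read(), "html.parser")

    record = dict(zip(DESC_FIELDS, lionmarketplace_description_parser(soup)))
    print(record["name"], record["USD"])    # fields default to "-1" when absent
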
@@ -147,130 +158,114 @@ def darkfox_description_parser(soup):

 #stores info it needs in different lists, these lists are returned after being organized
 #@param: soup object looking at html page of listing page
 #return: 'row' that contains a variety of lists that each hold info on the listing page
-def darkfox_listing_parser(soup):
+def lionmarketplace_listing_parser(soup):

     # Fields to be parsed
-    nm = 0                # Total_Products (Should be Integer)
-    mktName = "DarkFox"   # 0 Marketplace_Name
-    name = []             # 1 Product_Name
-    CVE = []              # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures)
-    MS = []               # 3 Product_MS_Classification (Microsoft Security)
-    category = []         # 4 Product_Category
-    describe = []         # 5 Product_Description
-    escrow = []           # 6 Vendor_Warranty
-    views = []            # 7 Product_Number_Of_Views
-    reviews = []          # 8 Product_Number_Of_Reviews
-    addDate = []          # 9 Product_AddDate
-    lastSeen = []         # 10 Product_LastViewDate
-    BTC = []              # 11 Product_BTC_SellingPrice
-    USD = []              # 12 Product_USD_SellingPrice
-    EURO = []             # 13 Product_EURO_SellingPrice
-    sold = []             # 14 Product_QuantitySold
-    qLeft = []            # 15 Product_QuantityLeft
-    shipFrom = []         # 16 Product_ShippedFrom
-    shipTo = []           # 17 Product_ShippedTo
-    vendor = []           # 18 Vendor
-    rating = []           # 19 Vendor_Rating
-    success = []          # 20 Vendor_Successful_Transactions
-    href = []             # 23 Product_Links (Urls)
-
-    listing = soup.findAll('div', {"class": "card"})
+    nm = 0                       # *Total_Products (Should be Integer)
+    mktName = "LionMarketplace"  # 0 *Marketplace_Name
+    vendor = []                  # 1 *Vendor y
+    rating_vendor = []           # 2 Vendor_Rating
+    success = []                 # 3 Vendor_Successful_Transactions
+    name = []                    # 4 *Product_Name y
+    CVE = []                     # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
+    MS = []                      # 6 Product_MS_Classification (Microsoft Security) dont worry about this
+    category = []                # 7 Product_Category y
+    describe = []                # 8 Product_Description
+    views = []                   # 9 Product_Number_Of_Views
+    reviews = []                 # 10 Product_Number_Of_Reviews
+    rating_item = []             # 11 Product_Rating
+    addDate = []                 # 12 Product_AddDate
+    BTC = []                     # 13 Product_BTC_SellingPrice
+    USD = []                     # 14 Product_USD_SellingPrice y
+    EURO = []                    # 15 Product_EURO_SellingPrice
+    sold = []                    # 16 Product_QuantitySold
+    qLeft = []                   # 17 Product_QuantityLeft
+    shipFrom = []                # 18 Product_ShippedFrom
+    shipTo = []                  # 19 Product_ShippedTo
+    href = []                    # 20 Product_Links
+
+    listing = soup.findAll('div', {"class": "card-body"})

     # Populating the Number of Products
     nm = len(listing)

     for a in listing:
-        bae = a.findAll('a', href=True)
-
-        # Adding the url to the list of urls
-        link = bae[0].get('href')
-        link = cleanLink(link)
-        href.append(link)
-
-        # Finding the Product
-        product = bae[1].find('p').text
-        product = product.replace('\n', ' ')
-        product = product.replace(",", "")
-        product = product.replace("...", "")
-        product = product.strip()
-        name.append(product)
-
-        bae = a.find('div', {'class': "media-content"}).find('div').find_all('div')
-
-        if len(bae) >= 5:
-            # Finding Prices
-            price = bae[0].text
-            ud = price.replace(" USD", " ")
-            # u = ud.replace("$","")
-            u = ud.replace(",", "")
-            u = u.strip()
-            USD.append(u)
-            # bc = (prc[1]).strip(' BTC')
-            # BTC.append(bc)
-
-            # Finding the Vendor
-            vendor_name = bae[1].find('a').text
-            vendor_name = vendor_name.replace(",", "")
-            vendor_name = vendor_name.strip()
-            vendor.append(vendor_name)
-
-            # Finding the Category
-            cat = bae[2].find('small').text
-            cat = cat.replace("Category: ", "")
-            cat = cat.replace(",", "")
-            cat = cat.strip()
-            category.append(cat)
-
-            # Finding Number Sold and Quantity Left
-            num = bae[3].text
-            num = num.replace("Sold: ", "")
-            num = num.strip()
-            sold.append(num)
-
-            quant = bae[4].find('small').text
-            quant = quant.replace("In stock: ", "")
-            quant = quant.strip()
-            qLeft.append(quant)
-
-            # Finding Successful Transactions
-            freq = bae[1].text
-            freq = freq.replace(vendor_name, "")
-            freq = re.sub(r'Vendor Level \d+', "", freq)
-            freq = freq.replace("(", "")
-            freq = freq.replace(")", "")
-            freq = freq.strip()
-            success.append(freq)
-
-        # Searching for CVE and MS categories
-        cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
-        if not cve:
-            cveValue = "-1"
-        else:
-            cee = " "
-            for idx in cve:
-                cee += (idx)
-                cee += " "
-                cee = cee.replace(',', ' ')
-                cee = cee.replace('\n', '')
-            cveValue = cee
-        CVE.append(cveValue)
-
-        ms = a.findAll(text=re.compile('MS\d{2}-\d{3}'))
-        if not ms:
-            MSValue = "-1"
-        else:
-            me = " "
-            for im in ms:
-                me += (im)
-                me += " "
-                me = me.replace(',', ' ')
-                me = me.replace('\n', '')
-            MSValue = me
-        MS.append(MSValue)
+        row = a.findAll('p')
+
+        # vendor
+        try:
+            temp = row[3].text
+            vendor.append(cleanString(temp.strip()))
+        except:
+            vendor.append("-1")
+            print('vendor')
+
+        # vendor rating
+        rating_vendor.append("-1")
+
+        # successful transactions CHECK AGAIN HERE
+        success.append("-1")
+
+        # product name
+        try:
+            temp = a.find('a').text
+            name.append(cleanString(temp.strip()))
+        except:
+            name.append("-1")
+            print('product name')
+
+        CVE.append('-1')
+        MS.append('-1')
+
+        # product category
+        try:
+            temp = row[2].text
+            if "Category: " in temp:
+                temp = temp.replace("Category: ", "")
+            category.append(cleanString(temp.strip()))
+        except:
+            category.append("-1")
+            print("Error in product category")
+
+        describe.append('-1')
+
+        # product views
+        views.append("-1")
+        reviews.append('-1')      # 10 Product_Number_Of_Reviews
+        rating_item.append('-1')  # 11 Product_Rating
+        addDate.append('-1')      # 12 Product_AddDate
+
+        # BTC
+        BTC.append('-1')
+
+        # USD
+        try:
+            temp = row[0].find('strong').text
+            if ' $' in temp:
+                temp = temp.replace(" $", "")
+            USD.append(cleanString(temp.strip()))  # 14 Product_USD_SellingPrice
+        except:
+            print("USD")
+            USD.append("-1")
+
+        EURO.append("-1")         # 15 Product_EURO_SellingPrice
+
+        # product sold
+        sold.append("-1")
+
+        qLeft.append('-1')        # 17 Product_QuantityLeft
+        shipFrom.append('-1')     # 18 Product_ShippedFrom
+        shipTo.append('-1')       # 19 Product_ShippedTo
+
+        # href
+        try:
+            temp = a.find('a').get('href')
+            href.append(temp)
+        except:
+            href.append("-1")
+            print('href')

     # Populate the final variable (this should be a list with all fields scraped)
-    return organizeProducts(mktName, nm, name, CVE, MS, category, describe, escrow, views, reviews, addDate, lastSeen,
-                            BTC, USD, EURO, qLeft, shipFrom, shipTo, vendor, rating, success, sold, href)
+    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href)
 #called by the crawler to get description links on a listing page
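
The listing parser builds one list per field, and every iteration of the card-body loop appends exactly once to each list, so the columns stay aligned and organizeProducts can zip them into per-product rows. A minimal sketch of that parallel-list invariant (the toy HTML and field subset are illustrative):

    from bs4 import BeautifulSoup

    html = '<div class="card-body"><a href="/p/1">demo product</a></div>'  # toy listing
    soup = BeautifulSoup(html, "html.parser")

    names, hrefs = [], []
    for card in soup.findAll('div', {"class": "card-body"}):
        link = card.find('a')
        names.append(link.text.strip() if link else "-1")  # "-1" keeps the columns aligned
        hrefs.append(link.get('href') if link else "-1")
    assert len(names) == len(hrefs)   # the invariant organizeProducts relies on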


setup.ini (+4 / -4)

@@ -1,11 +1,11 @@
 [TOR]
-firefox_binary_path = C:\Users\minhkhoitran\Desktop\Tor Browser\Browser\firefox.exe
-firefox_profile_path = C:\Users\minhkhoitran\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
-geckodriver_path = C:\nsf-reu\dw_pipeline_test\selenium\geckodriver.exe
+firefox_binary_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\firefox.exe
+firefox_profile_path = C:\\Users\\Helium\\Desktop\\Tor Browser\\Browser\\TorBrowser\\Data\\Browser\\profile.default
+geckodriver_path = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test\\selenium\\geckodriver.exe

 [Project]
-project_directory = C:\nsf-reu\dw_pipeline_test
+project_directory = C:\\Users\\Helium\\PycharmProjects\\dw_pipeline_test
 shared_folder = \\VBoxSvr\\Shared

 [PostgreSQL]
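
The setup.ini change re-points the Tor Browser, geckodriver, and project paths from one developer machine to another, doubling the backslashes in every path. The file is standard INI, so the values are presumably read with configparser, which performs no backslash unescaping; a minimal sketch under that assumption (section and key names are from the diff):

    import configparser

    config = configparser.ConfigParser()
    config.read('setup.ini')

    ff_binary = config['TOR']['firefox_binary_path']      # value is returned verbatim,
    project_dir = config['Project']['project_directory']  # doubled separators included
    print(ff_binary, project_dir)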

