diff --git a/MarketPlaces/Ares/crawler_selenium.py b/MarketPlaces/Ares/crawler_selenium.py
index 2e0c677..1f865ad 100644
--- a/MarketPlaces/Ares/crawler_selenium.py
+++ b/MarketPlaces/Ares/crawler_selenium.py
@@ -82,8 +82,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    # ff_prof.set_preference("network.dns.disablePrefetch", True)
-    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    ff_prof.set_preference("network.dns.disablePrefetch", True)
+    ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -119,6 +119,8 @@ def getAccess():
 
 def login(driver):
     # input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '//*[@id="username"]')))
 
     # entering username and password into input boxes
     usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
@@ -173,19 +175,19 @@ def getInterestedLinks():
     links = []
 
     # Digital - Malware
-    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2')
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/95c37970-002c-11ec-a5dc-1f4432087ed2')
     # Digital - Guides (Mostly carding, some useful hacking guides. probably dont use)
-    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662')
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/9a8bea70-002b-11ec-a3db-c90dd329f662')
     # Digital - Hacking
-    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921')
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/a81693f0-002b-11ec-9c39-110550ce4921')
     # Digital - Malware2
     links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/b3258c50-002b-11ec-b658-876d3d651145')
     # Digital - Sofware (50/50 hacking stuff and cracked software)
-    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1')
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/cff75df0-002b-11ec-8d0a-81fddeb36bf1')
     # Digital - Exploits
-    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6')
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/ef029550-002f-11ec-8711-675a8b116ba6')
     # Digital - Tutorials (Mostly random stuff, some useful tutorials, probably dont use)
-    # links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8')
+    links.append('http://aresonewgdyfb7oyyanlvp6e75arfivpkuy7rwo5uegayrv7rkztfwad.onion/category/f6e9c3b0-002b-11ec-85aa-c79a6ac8cfe8')
 
     return links
@@ -230,7 +232,7 @@ def crawlForum(driver):
                 #     break
 
                 try:
-                    link = driver.find_element(by=By.XPATH, value='/html/body/div[6]/div[3]/div/div[2]/nav/ul/li[4]/a').get_attribute('href')
+                    link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "Next")]').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
                     count += 1
@@ -242,7 +244,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1
 
-    print("Crawling the Ares market done.")
+    print("Crawling the Ares market done.")
 
 
 # Returns 'True' if the link is Topic link, may need to change for every website
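The pagination change above swaps a brittle absolute XPath for a locator keyed on the link's visible text, which survives layout changes. A minimal sketch of that pattern as a reusable helper; the function name and the None fallback are illustrative, not part of the patch:

    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    def get_next_page_link(driver, link_text="Next"):
        """Return the pagination anchor's href, or None on the last page."""
        try:
            anchor = driver.find_element(
                by=By.XPATH, value=f'//a[contains(text(), "{link_text}")]')
        except NoSuchElementException:
            return None
        return anchor.get_attribute('href') or None
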
diff --git a/MarketPlaces/Ares/parser.py b/MarketPlaces/Ares/parser.py
index 1803233..597a1eb 100644
--- a/MarketPlaces/Ares/parser.py
+++ b/MarketPlaces/Ares/parser.py
@@ -49,39 +49,23 @@ def ares_description_parser(soup):
 
     # Finding Vendor
     vendor = soup.find('a', {'class': "btn btn-sm btn-mgray my-1 w-100 text-white"}).get('href')
-    vendor = vendor.split('otherParty=')[-1].strip()
+    vendor = vendor.split('otherParty=')[-1]
+    vendor = cleanString(vendor).strip()
 
     # Finding Vendor Rating
     temp = box[1]
-    stars = len(temp.findAll('i', {"class": "fas fa-star"}))
+    rating_vendor = len(temp.findAll('i', {"class": "fas fa-star"}))
     half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"}))
-    rating_vendor = str(((stars - half_stars)/5) * 100)
-
-    # Finding the Product Rating and Number of Product Reviews
-    # reviews = temp[2].replace(" review)", "")
-    # reviews = reviews.strip()
-    #
-    # temp = temp[1].split(")")
-    # rating = temp[1].replace("Product Review : ", "")
-    # rating = rating.replace("%", "")
-    # rating_item = rating.strip()
+    if half_stars > 0:
+        rating_vendor += 0.5
+
+    # Finding Successful Transactions
+    success = box[2].text
+    success = cleanNumbers(success).strip()
 
     box2 = soup.find('div', {"class": "col-md-4 text-center"}).find('span', {"class": "text-left"}).findAll('span')
 
-    # Finding Prices
-    price = box2[0].text
-    price = price.replace("$", "")
-    price = price.replace('\n', '')
-    price = price.strip()
-    currency = box2[2].find('i').get('class')
-    if 'bitcoin' in currency:
-        BTC = price
-    elif 'USD' in currency:
-        USD = price
-    elif 'monero' in currency:
-        USD = (str(int(price) * 170.97))
-
+    # Finding USD
     USD = box2[0].text
     USD = USD.replace('\n', '')
     USD = USD.replace('$', '')
@@ -91,40 +75,18 @@
     vendor_image = soup.find('img', {"class": 'img-fluid'}).get('src')
     vendor_image = vendor_image.split('base64,')[-1]
 
-    # Finding the Product Category
-    # pmb = soup.findAll('p', {'class': "mb-1"})
-
-    # category = pmb[-1].text
-    # category = category.replace("Category: ", "").strip()
-
-    # Finding the Product Quantity Available
-    # left = divmb[-1].text
-    # left = left.split(",", 1)[1]
-    # left = left.replace("in stock", "")
-    # left = left.strip()
-
-    # Finding Number Sold
-    # sold = divmb[-1].text
-    # sold = sold.split(",", 1)[0]
-    # sold = sold.replace("sold", "")
-    # sold = sold.strip()
-
-    # Finding Shipment Information (Origin)
-    # pmb[0].text
-    # shipFrom = shipFrom.replace("Ships from: ", "").strip()
-
-    # Finding Shipment Information (Destination)
-    # pmb[1].text
-    # shipTo = shipTo.replace("Ships to: ", "").strip()
-
     # Finding the Product description
-    cardbody = soup.find('div', {"class": 'row-md-12'}).find('div', {"class": 'col-md-4'}).find('textarea', {"class": 'disabled form-control form-control-sm w-100 bg-mgray text-white rounded-0 border-danger'})
-    describe = cardbody.text.strip()
+    temp = soup.find('div', {"class": 'row-md-12'}).find('div', {"class": 'col-md-4'})
+    cardbody = temp.find('textarea', {"class": 'disabled form-control form-control-sm w-100 bg-mgray text-white rounded-0 border-danger'})
+    describe = cleanString(cardbody.text).strip()
 
     # Finding Product Image
     image = soup.find('div', {"class": 'row-md-12'}).find('div', {"class": 'col-md-4 text-center'}).find('img')
-    image = image.get('src')
-    image = image.split('base64,')[-1]
+    if image is not None:
+        image = image.get('src')
+        image = image.split('base64,')[-1]
+    else:
+        image = "-1"
 
     # Searching for CVE and MS categories
     cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
@@ -184,6 +146,9 @@ def ares_listing_parser(soup):
     image_vendor = []                       # 21 Vendor_Image
     href = []                               # 22 Product_Links
 
+    cat = soup.find('span', {"class": "btn btn-sm btn-outline-dark w-100 active"}).text
+    cat = cleanString(cat).strip()
+
     listing = soup.find('div', {"class": 'card-body text-black text-left bg-dark'}).findAll('div', {"class": 'card mb-4 border-danger rounded-0'})
 
     # Populating the Number of Products
@@ -191,6 +156,8 @@
 
     for a in listing:
 
+        category.append(cat)
+
         # Adding the url to the list of urls
         link = a.find('a', {'class': "badge badge-danger w-100 text-white"}).get('href')
         link = cleanLink(link)
@@ -214,14 +181,15 @@
         price = a.findAll('a', {"class": "text-white"})[-1].text
         price = price.replace("$","")
         price = price.strip()
-        currency = a.find('div', {"class": 'card-header bg-mgray rounded-0'}).findAll('i')[1]
-        if 'bitcoin' in currency.get('class'):
-            BTC.append(price)
-        elif 'USD' in currency.get('class'):
-            USD.append(price)
-        elif 'monero' in currency.get('class'):
-            USD.append(str(int(price) * 170.97))
+        USD.append(price)
 
+        # Finding Item Rating
+        temp = a.find('small', {"class": "text-white"})
+        rating = len(temp.findAll('i', {"class": "fas fa-star"}))
+        half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"}))
+        if half_stars > 0:
+            rating += 0.5
+        rating_item.append(str(rating))
 
         # Finding the Vendor
         vendor_name = a.find('a', {"class": 'badge badge-dark w-100 text-white my-1'}).text
@@ -231,36 +199,6 @@
 
         image_vendor.append("-1")
 
-        # Finding the Category
-        # cat = lb[-1].find("span").text
-        # cat = cat.replace("class:", "")
-        # cat = cat.strip()
-        # category.append(cat)
-
-        # Finding Number of Views
-        # num = span[0].text
-        # num = num.replace("views:", "")
-        # num = num.strip()
-        # sold.append(num)
-
-        # Finding Number Sold
-        # num = span[2].text
-        # num = num.replace("Sold:", "")
-        # num = num.strip()
-        # sold.append(num)
-
-        # Finding Quantity Left
-        # quant = span[1].text
-        # quant = quant.replace("stock:", "")
-        # quant = quant.strip()
-        # qLeft.append(quant)
-
-        # add shipping information
-        # ship = lb[2].findAll('small')[1].findAll('span')[1].text.split("->")
-        # shipFrom.append(ship[0].replace("Ship from ", "").strip())
-        # shipTo.append(ship[1].replace("to ", "").strip())
-
         # Searching for CVE and MS categories
         cve = a.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
         if not cve:
@@ -302,11 +240,6 @@ def ares_links_parser(soup):
     href = []
     listing = soup.findAll('div', {"class": "col-md-4 my-md-0 my-2 col-12"})
 
-    # for a in listing:
-    #     bae = a.find('a', {"class": "text-info"}, href=True)
-    #     link = bae['href']
-    #     href.append(link)
-
     for a in listing:
         bae = a.findAll('a', href=True)
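Both Ares parsers now derive ratings by counting star icons instead of computing a percentage. A minimal sketch of that logic pulled into one helper, assuming ratings render as Font Awesome icons ("fas fa-star" full, "fas fa-star-half-alt" half); the helper name is illustrative:

    def parse_star_rating(tag):
        """Count star icons inside `tag`; returns a 0-5 rating as a float."""
        full_stars = len(tag.findAll('i', {"class": "fas fa-star"}))
        half_stars = len(tag.findAll('i', {"class": "fas fa-star-half-alt"}))
        rating = full_stars
        if half_stars > 0:
            rating += 0.5
        return rating
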
diff --git a/MarketPlaces/Kingdom/crawler_selenium.py b/MarketPlaces/Kingdom/crawler_selenium.py
index 5385150..b8e99f0 100644
--- a/MarketPlaces/Kingdom/crawler_selenium.py
+++ b/MarketPlaces/Kingdom/crawler_selenium.py
@@ -48,6 +48,7 @@ def startCrawling():
 
     new_parse(mktName, baseURL, True)
 
+
 # Login using premade account credentials and do login captcha manually
 def login(driver):
@@ -75,8 +76,6 @@ def login(driver):
         (By.XPATH, '/html/body/div/div/div[3]/div[1]/div/div[3]')))
 
-
-
 # Returns the name of the website
 def getMKTName():
     name = 'Kingdom'
@@ -116,8 +115,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    ff_prof.set_preference("network.dns.disablePrefetch", True)
-    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    # ff_prof.set_preference("network.dns.disablePrefetch", True)
+    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
@@ -186,11 +185,11 @@ def getInterestedLinks():
     links = []
 
     # Software and Malware
-    links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127&t=597a56b9a0b3e0d0')
+    links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=127')
     # # Services
-    links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45&t=597a56b9a0b3e0d0')
+    links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=45')
     # # guides and tutorials
-    links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=107&t=597a56b9a0b3e0d0')
+    links.append('http://kingdomm7v6yed55o2rbspvs4exn5bzfxdizqaav27tw6gw4zc65vdad.onion/offers?filter_category=107')
 
     return links
@@ -236,7 +235,7 @@ def crawlForum(driver):
                 #     break
 
                 try:
-                    link = driver.find_element(by=By.XPATH, value='/html/body/div/div/div[3]/div[2]/div[2]/div[3]/div/ul/li[13]/a').get_attribute('href')
+                    link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "»")]').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
                     count += 1
@@ -248,7 +247,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1
 
-    print("Crawling the Kingdom market done.")
+    print("Crawling the Kingdom market done.")
 
 
 # Returns 'True' if the link is Topic link
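The Kingdom seed URLs above drop a `t=...` query parameter that looks session-bound, so the category links stay valid across sessions. A minimal sketch of stripping such volatile parameters programmatically; the parameter name comes from the old URLs, the helper itself is illustrative:

    from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

    def strip_params(url, volatile=("t",)):
        """Drop session-bound query parameters from a seed URL."""
        parts = urlparse(url)
        query = [(k, v) for k, v in parse_qsl(parts.query) if k not in volatile]
        return urlunparse(parts._replace(query=urlencode(query)))

    # strip_params('http://example.onion/offers?filter_category=127&t=597a56b9a0b3e0d0')
    # -> 'http://example.onion/offers?filter_category=127'
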
diff --git a/MarketPlaces/Kingdom/parser.py b/MarketPlaces/Kingdom/parser.py
index abade27..ae75d67 100644
--- a/MarketPlaces/Kingdom/parser.py
+++ b/MarketPlaces/Kingdom/parser.py
@@ -40,56 +40,49 @@ def kingdom_description_parser(soup):
 
     desc = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-cont"})
 
     name = tag.find('div',{"class": "col-md-8"}).find('div', {"class": "box-head"}).text
-    name = name.replace('\n', ' ')
-    name = name.replace(',', ' ')
-    name = name.strip()
+    name = cleanString(name).strip()
 
     # Finding Prices
     # Kingdom prices can be shown in a variety of currencies, not all in USD, so keeping currency
     rows = desc.find_all('div', {"class", "row"}, recursive=False)
-    price = rows[-1].find('div', {"class": "row"}).find('h3').text
-    price = price.replace(',', '')
-    price = price.strip()
-    # USD = price.replace("USD",'')
+    USD = rows[-1].find('div', {"class": "row"}).find('h3').text
+    USD = cleanNumbers(USD).strip()
     BTC = rows[-1].find('div', {"class": "row"}).find_next_sibling('div').find('span').text
+    BTC = cleanNumbers(BTC).strip()
 
     # Finding Vendor
     vendor = rows[0].select_one('a[href^="/user"]').text
-    vendor = vendor.replace(",", " ")
-    vendor = vendor.strip()
+    vendor = cleanString(vendor).strip()
 
     # Finding Shipment Information (Origem)
     descs = rows[0].find_all('div', {"class": "col-md-3 text-right"})
     shipFrom = descs[2].text
-    shipFrom = shipFrom.replace(",", "")
-    shipFrom = shipFrom.strip()
+    shipFrom = cleanString(shipFrom).strip()
 
     # Finding Shipment Information (Destiny)
     shipTo = rows[-1].find('div', {"class": "col-md-6"}).text
     shipTo = shipTo.replace("Ship to:","")
-    shipTo = shipTo.replace(",","").strip()
-    if(shipTo == ''):
-        shipTo = -1
+    shipTo = cleanString(shipTo).strip()
+    if shipTo == '':
+        shipTo = "-1"
 
     # Finding the Product Category
     category = descs[0].text
-    category = category.replace(",", "")
-    category = category.strip()
+    category = cleanString(category).strip()
 
     # Finding the Product Quantity Available
     left = descs[1].text
-    left = left.replace(",", "")
-    left = left.strip()
+    left = cleanString(left).strip()
 
     # Finding when the Product was Added
     dt = descs[-1].text.strip()
     addDate = datetime.strptime(dt, '%d.%m.%Y')
 
     # Finding the Product description
-    describe = cleanString(soup.find('div', {"id": "descriptionContent"}).text)
+    describe = cleanString(soup.find('div', {"id": "descriptionContent"}).text).strip()
 
     # Finding the Number of Product Reviews
-    review = len(soup.find('div', {"id": "feedbackContent"}).find_all(recursive=False))
+    reviews = str(len(soup.find('div', {"id": "feedbackContent"}).find_all(recursive=False)))
 
     # Searching for CVE and MS categories
     # no cve or ms in Kingdom
@@ -143,19 +136,13 @@ def kingdom_listing_parser(soup):
         #in array USD, there may be prices not in USD, so includes currency as well
         prices = a.find('div', {"class": "col-md-3"})
         u = prices.find('h3').text
-        u = u.strip()
-        u = u.replace(',', '')
-        u = u.strip()
-        USD.append(u)
+        USD.append(cleanNumbers(u).strip())
         bc = prices.find('div').find('span').text
-        BTC.append(bc)
+        BTC.append(cleanNumbers(bc).strip())
 
         # Finding the Product
         product = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]').text
-        product = product.replace('\n', ' ')
-        product = product.replace(","," ")
-        product = product.strip()
-        name.append(product)
+        name.append(cleanString(product).strip())
 
         # Finding Product Image
         product_image = a.find('img')
@@ -165,15 +152,22 @@ def kingdom_listing_parser(soup):
 
         # Finding the Vendor
         vendor_name = a.select_one('a[href^="/user"]').text
-        vendor_name = vendor_name.replace(",", " ").replace('/', '')
-        vendor_name = vendor_name.strip()
-        vendor.append(vendor_name)
+        vendor_name = vendor_name.replace('/', '')
+        vendor.append(cleanString(vendor_name).strip())
 
-        image_vendor.append("-1")
+        # Finding Views
+        product_views = a.find('div', {"class": "col-md-7"}).find_all('p')[0].text
+        views.append(cleanNumbers(product_views).strip())
+
+        # Finding Sold
+        product_sold = a.find('div', {"class": "base-label label label-rounded label-success"})
+        if product_sold is not None:
+            sold.append(cleanNumbers(product_sold.text).strip())
+        else:
+            sold.append("-1")
 
         # Adding the url to the list of urls
         link = a.find('div', {"class": "col-md-7"}).select_one('a[href^="/offer/view?"]')['href']
-        link = cleanLink(link)
         href.append(link)
 
     # Searching for CVE and MS categories
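The Kingdom parser now routes all numeric fields through a shared cleanNumbers helper instead of ad-hoc strip/replace chains. Its definition is not part of this diff; a minimal sketch of the behavior assumed here (pull the first number out of a labeled string), for illustration only:

    import re

    def cleanNumbers(text):
        """Assumed behavior: extract the numeric part of e.g. 'Price 1,337.00 USD'."""
        match = re.search(r'\d[\d,.]*', text)
        return match.group().replace(',', '') if match else ''

    # cleanNumbers("Price 1,337.00 USD")  ->  "1337.00"

The real helper lives in MarketPlaces.Utilities.utilities and may differ in detail.
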
diff --git a/MarketPlaces/Quest/crawler_selenium.py b/MarketPlaces/Quest/crawler_selenium.py
index 213ab24..8a84d68 100644
--- a/MarketPlaces/Quest/crawler_selenium.py
+++ b/MarketPlaces/Quest/crawler_selenium.py
@@ -118,7 +118,8 @@ def getAccess():
 
 def login(driver):
-    input("Press ENTER when CAPTCHA is complete and login page has loaded\n")
+    WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
+        (By.XPATH, '//*[@id="username"]')))
 
     # entering username and password into input boxes
     usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]')
@@ -128,10 +129,10 @@ def login(driver):
     # Password here
     passwordBox.send_keys('Mahogany')
     # Clicking the login button
-    login_button = driver.find_element(By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button')
-    login_button.click()
+    # login_button = driver.find_element(By.XPATH, value='/html/body/div[2]/div/div[3]/div/div/div/div[1]/form/div[4]/div/div/button')
+    # login_button.click()
 
-    input("Press ENTER when CAPTCHA is completed and you exit the newsletter\n")
+    input("Press ENTER when CAPTCHA is completed\n")
 
     # wait for listing page show up (This Xpath may need to change based on different seed url)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
@@ -175,6 +176,8 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
+    ## Services
+    links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/8ae67900-22ed-11ec-a710-31f963ce8d35')
     ## Software
     links.append('http://questxwvkwvsw2qgeeljz4fbv6cq2kbmapo7tw5heu4nng2ufgykapid.onion/category/92809300-22ed-11ec-b143-af312e1dab77')
     ## Tutorial
@@ -245,7 +248,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1
 
-    print("Crawling the Quest market done.")
+    print("Crawling the Quest market done.")
 
 
 # Returns 'True' if the link is Topic link, may need to change for every website
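Both the Ares and Quest logins now gate on an explicit wait for the username field rather than a manual ENTER prompt, so the crawler proceeds as soon as the page is actually ready. A minimal sketch of that gate as a shared helper; timeout and locator are the ones from the patch, the function name is mine:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    def wait_for_login_form(driver, xpath='//*[@id="username"]', timeout=100):
        """Block until the login form is visible (e.g. after a manual CAPTCHA)."""
        WebDriverWait(driver, timeout).until(
            EC.visibility_of_element_located((By.XPATH, xpath)))
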
diff --git a/MarketPlaces/Quest/parser.py b/MarketPlaces/Quest/parser.py
index d9c96e3..6852b04 100644
--- a/MarketPlaces/Quest/parser.py
+++ b/MarketPlaces/Quest/parser.py
@@ -37,13 +37,15 @@ def quest_description_parser(soup):
     vendor_image = "-1"                    # 20 Vendor_Image
 
     # Finding Product Name
-    name = soup.find('div', class_='card-header bg-dark text-white rounded-0 text-center').text.strip()
+    name = soup.find('div', class_='card-header bg-dark text-white rounded-0 text-center').text
+    name = cleanString(name).strip()
 
     # USD Price
-    USD = soup.find('small', text='Product Price:').find_next('small').text.strip().replace('$', '')
+    USD = soup.find('small', text='Product Price:').find_next('small').text.replace('$', '').strip()
 
     # Product Description
-    describe = soup.find('textarea').text.strip()
+    describe = soup.find('textarea').text
+    describe = cleanString(describe).strip()
 
     # Finding Product Image
     image = soup.find('img', {'class': 'img-fluid'})
@@ -53,6 +55,23 @@
     vendor_image = soup.select_one('.card-body.bg-mgray.css-selector.shadow img')
     vendor_image = vendor_image.get('src').split('base64,')[-1]
 
+    # Finding Successful Transactions
+    success = soup.find('strong', text='Total Sales:').parent.text
+    success = cleanNumbers(success).strip()
+
+    # Finding Vendor Rating
+    temp = soup.find('strong', text='Rating:').parent
+    rating_vendor = len(temp.findAll('i', {"class": "fas fa-star"}))
+    half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"}))
+    if half_stars > 0:
+        rating_vendor += 0.5
+
+    # Finding Item Rating
+    temp = soup.find('small', text='Average Product Score:').find_next('small')
+    rating_item = len(temp.findAll('i', {"class": "fas fa-star"}))
+    half_stars = len(temp.findAll('i', {'class': "fas fa-star-half-alt"}))
+    if half_stars > 0:
+        rating_item += 0.5
 
     # Populating the final variable (this should be a list with all fields scraped)
     row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
@@ -107,7 +126,7 @@
             # Using the first tag as default
             product_link_tag = product_link_tags[0]
             href.append(product_link_tag['href'])
-            name.append(product_link_tag.text.strip())
+            name.append(cleanString(product_link_tag.text).strip())
 
         # Extracting Product Image
         img_tag = a.find('img')
@@ -118,7 +137,7 @@
         # Extracting Vendor Name
         vendor_tag = a.find('a', class_='badge-dark')
         if vendor_tag:
-            vendor.append(vendor_tag.text.replace('👤', '').strip())
+            vendor.append(cleanString(vendor_tag.text.replace('👤', '')).strip())
 
         # Extracting Product Price in USD
         price_tag = a.find('a', class_='text')
@@ -127,8 +146,7 @@
 
         category_tag = soup.find('span', class_= 'btn btn-sm btn-outline-mgray active border-info')
         if category_tag:
-            category.append(category_tag.text.strip())
-
+            category.append(cleanString(category_tag.text).strip())
 
     # Populate the final variable (this should be a list with all fields scraped)
     return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
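The new Quest fields are located by their visible labels (find by text, then walk to .parent or .find_next) rather than by positional selectors, which tolerates layout shuffles. A minimal sketch of the pattern; the HTML snippet is a made-up stand-in for the real Quest markup, only the navigation matters:

    from bs4 import BeautifulSoup

    html = '<p><strong>Total Sales:</strong> 42</p>'
    soup = BeautifulSoup(html, 'html.parser')

    label = soup.find('strong', text='Total Sales:')  # anchor on the label text
    value = label.parent.text                         # then read its container
    # value == 'Total Sales: 42'; a cleanNumbers-style pass would yield "42"
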
ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) + ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True) + ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) + ff_prof.set_preference("signon.rememberSignons", False) + ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + ff_prof.set_preference("network.dns.disablePrefetch", True) + ff_prof.set_preference("network.http.sendRefererHeader", 0) + ff_prof.set_preference("permissions.default.image", 3) + ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", False) + ff_prof.update_preferences() + + service = Service(config.get('TOR', 'geckodriver_path')) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + + driver.maximize_window() + + return driver + + +#the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + try: + driver.get(url) + return driver + except: + driver.close() + return 'down' + + +def login(driver): + # input("Press ENTER when CAPTCHA is complete and login page has loaded\n") + # + # # entering username and password into input boxes + # usernameBox = driver.find_element(by=By.XPATH, value='//input[@name="username"]') + # # Username here + # usernameBox.send_keys('aliciamykeys') + # passwordBox = driver.find_element(by=By.XPATH, value='//input[@name="password"]') + # # Password here + # passwordBox.send_keys('aliciawherearemykey$') + # # session time + # session_select = Select(driver.find_element(by=By.XPATH, value='/html/body/main/div/div/div/div/div/form/div[4]/div/div[2]/select')) + # session_select.select_by_visible_text('Session 60min') + + input("Press ENTER when CAPTCHA is completed and listing page loaded\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="searchbar"]'))) + + +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +def getFullPathName(url): + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") + fileName = getNameFromURL(url) + if isDescriptionLink(url): + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') + else: + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + return fullPath + + +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if name == '': + name = str(counter) + counter = counter + 1 + return name + + +def getInterestedLinks(): + links = [] + + # Guides and Tutorials + 
links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/3') + # Software and Malware + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/4') + # Fraud + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/5') + # Digital Products + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/21') + # Services + links.append('http://sonanzazddbd3lqw2ai6uwmnb4fx7tj7h6hmdwkfqe7bnzgpuvkxqpyd.onion/category/22') + + return links + + +def crawlForum(driver): + + print("Crawling the Sonanza market") + + linksToCrawl = getInterestedLinks() + + i = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + try: + has_next_page = True + count = 0 + + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(driver, html, link) + + list = productPages(html) + + for item in list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver, driver.page_source, item) + driver.back() + + # # comment out + # break + # + # # comment out + # if count == 1: + # break + + try: + link = driver.find_element(by=By.XPATH, value='//a[contains(text(), "›")]').get_attribute('href') + if link == "": + raise NoSuchElementException + count += 1 + + except NoSuchElementException: + has_next_page = False + + except Exception as e: + print(link, e) + i += 1 + + print("Crawling the Sonanza market done.") + + +# Returns 'True' if the link is Topic link, may need to change for every website +def isDescriptionLink(url): + if 'article' in url: + return True + return False + + +# Returns True if the link is a listingPage link, may need to change for every website +def isListingLink(url): + if 'category' in url: + return True + return False + + +def productPages(html): + soup = BeautifulSoup(html, "html.parser") + return sonanza_links_parser(soup) + + +def crawler(): + startCrawling() diff --git a/MarketPlaces/Sonanza/parser.py b/MarketPlaces/Sonanza/parser.py new file mode 100644 index 0000000..10166f5 --- /dev/null +++ b/MarketPlaces/Sonanza/parser.py @@ -0,0 +1,238 @@ +__author__ = 'DarkWeb' + +# Here, we are importing the auxiliary functions to clean or convert data +from MarketPlaces.Utilities.utilities import * + +# Here, we are importing BeautifulSoup to search through the HTML tree +from bs4 import BeautifulSoup + + +# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs +# stores info it needs in different lists, these lists are returned after being organized +# @param: soup object looking at html page of description page +# return: 'row' that contains a variety of lists that each hold info on the description page +def sonanza_description_parser(soup): + # Fields to be parsed + + vendor = "-1" # 0 *Vendor_Name + success = "-1" # 1 Vendor_Successful_Transactions + rating_vendor = "-1" # 2 Vendor_Rating + name = "-1" # 3 *Product_Name + describe = "-1" # 4 Product_Description + CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = "-1" # 6 Product_MS_Classification (Microsoft Security) + category = "-1" # 7 Product_Category + views = "-1" # 8 Product_Number_Of_Views + reviews = "-1" # 9 Product_Number_Of_Reviews + rating_item = "-1" # 10 Product_Rating + addDate = "-1" # 11 Product_AddedDate + BTC = "-1" # 12 
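One portability note on the new file: getFullPathName mixes os.path.join with raw Windows separators (r'\\Description\\'), which produces literal backslashes inside file names on POSIX hosts. A minimal, platform-neutral sketch of the same path assembly; the parameter names stand in for the values used above:

    import os

    def build_full_path(mainDir, current_date, fileName, is_description):
        """Assemble the save path without hard-coded Windows separators."""
        subdir = 'Description' if is_description else 'Listing'
        return os.path.join(mainDir, current_date, subdir, fileName + '.html')

On Windows the result is equivalent; elsewhere the date/Description/Listing levels become real directories instead of backslash-laden file names.
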
diff --git a/MarketPlaces/Sonanza/parser.py b/MarketPlaces/Sonanza/parser.py
new file mode 100644
index 0000000..10166f5
--- /dev/null
+++ b/MarketPlaces/Sonanza/parser.py
@@ -0,0 +1,238 @@
+__author__ = 'DarkWeb'
+
+# Here, we are importing the auxiliary functions to clean or convert data
+from MarketPlaces.Utilities.utilities import *
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+
+
+# parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs
+# stores info it needs in different lists, these lists are returned after being organized
+# @param: soup object looking at html page of description page
+# return: 'row' that contains a variety of lists that each hold info on the description page
+def sonanza_description_parser(soup):
+    # Fields to be parsed
+
+    vendor = "-1"            # 0 *Vendor_Name
+    success = "-1"           # 1 Vendor_Successful_Transactions
+    rating_vendor = "-1"     # 2 Vendor_Rating
+    name = "-1"              # 3 *Product_Name
+    describe = "-1"          # 4 Product_Description
+    CVE = "-1"               # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures)
+    MS = "-1"                # 6 Product_MS_Classification (Microsoft Security)
+    category = "-1"          # 7 Product_Category
+    views = "-1"             # 8 Product_Number_Of_Views
+    reviews = "-1"           # 9 Product_Number_Of_Reviews
+    rating_item = "-1"       # 10 Product_Rating
+    addDate = "-1"           # 11 Product_AddedDate
+    BTC = "-1"               # 12 Product_BTC_SellingPrice
+    USD = "-1"               # 13 Product_USD_SellingPrice
+    EURO = "-1"              # 14 Product_EURO_SellingPrice
+    sold = "-1"              # 15 Product_QuantitySold
+    left = "-1"              # 16 Product_QuantityLeft
+    shipFrom = "-1"          # 17 Product_ShippedFrom
+    shipTo = "-1"            # 18 Product_ShippedTo
+    image = "-1"             # 19 Product_Image
+    vendor_image = "-1"      # 20 Vendor_Image
+
+    listing = soup.find('div', {"id": "article_page"})
+
+    # Finding the Product
+    name = listing.find('div', {"class": "row box"}).text
+    name = cleanString(name).strip()
+
+    # Finding Product Image
+    product_image = listing.find('img')
+    product_image = product_image.get('src')
+    product_image = product_image.split('base64,')[-1]
+    image = product_image
+
+    table = listing.find('div', {"class": "col-md-5"})
+
+    # Finding Prices
+    USD = table.find('span', {"class": "pr"}).text
+    USD = USD.replace("$", "").strip()
+
+    BTC = table.find_all('span', {"class": "pr1"})[1].text
+    BTC = BTC.replace("BTC", "").strip()
+
+    rows = table.find_all('p', {"class": "mb-0"})
+    for row in rows:
+        temp = row.text
+        if "CATEGORY" in temp:
+            category = temp.replace("CATEGORY :", "")
+            category = cleanString(category).strip()
+        elif "VENDOR LEVEL" in temp:
+            rating_vendor = temp.replace("VENDOR LEVEL :", "")
+            rating_vendor = cleanString(rating_vendor).strip()
+
+    rows = listing.find_all('p', {"class": "mb-1"})
+    for row in rows:
+        temp = row.text
+        if "VENDOR" in temp:
+            vendor = temp.replace("VENDOR :", "")
+            vendor = cleanString(vendor).strip()
+        elif "SHIPS TO" in temp:
+            shipTo = temp.replace("SHIPS TO :", "")
+            shipTo = cleanString(shipTo).strip()
+        elif "SOLD" in temp:
+            sold = cleanNumbers(temp).strip()
+
+    # Finding Product Description
+    describe = listing.find('pre').text
+    describe = cleanString(describe).strip()
+
+    # Searching for CVE and MS categories
+    cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+    if cve:
+        CVE = " "
+        for idx in cve:
+            CVE += (idx)
+            CVE += " "
+            CVE = CVE.replace(',', ' ')
+            CVE = CVE.replace('\n', '')
+    ms = soup.findAll(text=re.compile('MS\d{2}-\d{3}'))
+    if ms:
+        MS = " "
+        for im in ms:
+            MS += (im)
+            MS += " "
+            MS = MS.replace(',', ' ')
+            MS = MS.replace('\n', '')
+
+    # Populating the final variable (this should be a list with all fields scraped)
+    row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate,
+           BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image)
+
+    # Sending the results
+    return row
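Both Sonanza parsers repeat the same CVE/MS regex-accumulation block. A minimal sketch of it condensed into one helper; the "-1" sentinel and the regexes come from the patch, while the helper name and the join-based cleanup are illustrative:

    import re

    def find_ids(tag, pattern):
        """Collect identifiers such as CVE-2021-1234 from a soup tag, or "-1"."""
        hits = tag.findAll(text=re.compile(pattern))
        if not hits:
            return "-1"
        return " ".join(h.replace(',', ' ').replace('\n', '') for h in hits)

    # find_ids(soup, r'CVE-\d{4}-\d{4}')  or  find_ids(soup, r'MS\d{2}-\d{3}')
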
+
+
+# parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
+# stores info it needs in different lists, these lists are returned after being organized
+# @param: soup object looking at html page of listing page
+# return: 'row' that contains a variety of lists that each hold info on the listing page
+def sonanza_listing_parser(soup):
+
+    # Fields to be parsed
+    nm = 0                       # *Total_Products (Should be Integer)
+    mktName = "Sonanza"          # 0 *Marketplace_Name
+    vendor = []                  # 1 *Vendor y
+    rating_vendor = []           # 2 Vendor_Rating
+    success = []                 # 3 Vendor_Successful_Transactions
+    name = []                    # 4 *Product_Name y
+    CVE = []                     # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about this
+    MS = []                      # 6 Product_MS_Classification (Microsoft Security) dont worry about this
+    category = []                # 7 Product_Category y
+    describe = []                # 8 Product_Description
+    views = []                   # 9 Product_Number_Of_Views
+    reviews = []                 # 10 Product_Number_Of_Reviews
+    rating_item = []             # 11 Product_Rating
+    addDate = []                 # 12 Product_AddDate
+    BTC = []                     # 13 Product_BTC_SellingPrice
+    USD = []                     # 14 Product_USD_SellingPrice y
+    EURO = []                    # 15 Product_EURO_SellingPrice
+    sold = []                    # 16 Product_QuantitySold
+    qLeft = []                   # 17 Product_QuantityLeft
+    shipFrom = []                # 18 Product_ShippedFrom
+    shipTo = []                  # 19 Product_ShippedTo
+    image = []                   # 20 Product_Image
+    image_vendor = []            # 21 Vendor_Image
+    href = []                    # 22 Product_Links
+
+    listings = soup.findAll('div', {"class": "col-sm-4 col-md-3"})
+
+    # Populating the Number of Products
+    nm = len(listings)
+
+    for listing in listings:
+
+        # Adding the url to the list of urls
+        bae = listing.find('a', href=True)
+        link = bae.get('href')
+        href.append(link)
+
+        # Finding Product Image
+        product_image = listing.find('img')
+        product_image = product_image.get('src')
+        product_image = product_image.split('base64,')[-1]
+        image.append(product_image)
+
+        # Finding the Product
+        product = listing.find('h5', {"class": "art_title"}).text
+        product = cleanString(product)
+        name.append(product.strip())
+
+        # Finding Prices
+        price = listing.find('span', {"class": "priceP"}).text
+        price = price.replace("$", "")
+        USD.append(price.strip())
+
+        rows = listing.find_all('p', {"class": "mb-0 card-text"})
+        for row in rows:
+            temp = row.text
+            if "CATEGORY" in temp:
+                cat = temp.replace("CATEGORY :", "")
+                cat = cleanString(cat)
+                category.append(cat.strip())
+            elif "VENDOR" in temp:
+                vendor_name = temp.replace("VENDOR :", "")
+                vendor_name = cleanString(vendor_name)
+                vendor.append(vendor_name.strip())
+
+        # Finding Vendor Rating
+        rating = listing.find('span', {"class": "badge badge-info"}).text
+        rating = rating.replace("VENDOR LEVEL :", "")
+        rating = cleanString(rating)
+        rating_vendor.append(rating.strip())
+
+        # Searching for CVE and MS categories
+        cve = listing.findAll(text=re.compile('CVE-\d{4}-\d{4}'))
+        if not cve:
+            cveValue = "-1"
+        else:
+            cee = " "
+            for idx in cve:
+                cee += (idx)
+                cee += " "
+                cee = cee.replace(',', ' ')
+                cee = cee.replace('\n', '')
+            cveValue = cee
+        CVE.append(cveValue)
+
+        ms = listing.findAll(text=re.compile('MS\d{2}-\d{3}'))
+        if not ms:
+            MSValue = "-1"
+        else:
+            me = " "
+            for im in ms:
+                me += (im)
+                me += " "
+                me = me.replace(',', ' ')
+                me = me.replace('\n', '')
+            MSValue = me
+        MS.append(MSValue)
+
+    # Populate the final variable (this should be a list with all fields scraped)
+    return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views,
+                            reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor)
+
+
+# called by the crawler to get description links on a listing page
+# @param: beautifulsoup object that is using the correct html page (listing page)
+# return: list of description links from a listing page
+def sonanza_links_parser(soup):
+    # Returning all links that should be visited by the Crawler
+
+    href = []
+    listings = soup.findAll('div', {"class": "col-sm-4 col-md-3"})
+
+    for listing in listings:
+        a = listing.find('a', href=True)
+
+        # Adding the url to the list of urls
+        link = a.get('href')
+        href.append(link)
+
+    return href
\ No newline at end of file
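A quick way to smoke-test the new parsers is to feed them a page already captured by savePage. A minimal sketch; the file path is hypothetical, any saved Sonanza listing page would do:

    from bs4 import BeautifulSoup
    from MarketPlaces.Sonanza.parser import sonanza_links_parser

    with open('Sonanza_listing_sample.html', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    for link in sonanza_links_parser(soup):
        print(link)  # each href feeds crawlForum's description-page loop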