From e12abc1fa500ae41de17a23820dc0867ea2ed0ff Mon Sep 17 00:00:00 2001
From: westernmeadow
Date: Wed, 13 Sep 2023 16:49:57 -0700
Subject: [PATCH] finished fully running completed markets

---
 MarketPlaces/AnonMarket/crawler_selenium.py   |  80 ++++++-------
 MarketPlaces/DB_Connection/db_connection.py   | 106 ++++++++++--------
 MarketPlaces/DarkMatter/crawler_selenium.py   |  28 ++---
 MarketPlaces/DarkMatter/parser.py             |  11 +-
 .../DigitalThriftShop/crawler_selenium.py     |  36 +++---
 MarketPlaces/DigitalThriftShop/parser.py      |  17 +--
 MarketPlaces/HiddenMarket/crawler_selenium.py |  52 ++++-----
 MarketPlaces/HiddenMarket/parser.py           |  20 ++--
 MarketPlaces/Initialization/prepare_parser.py |   8 +-
 .../LionMarketplace/crawler_selenium.py       |  44 +++++---
 MarketPlaces/LionMarketplace/parser.py        |  82 ++++++--------
 .../MetaVerseMarket/crawler_selenium.py       |  30 ++---
 MarketPlaces/MetaVerseMarket/parser.py        |  92 +++++++--------
 MarketPlaces/Nexus/crawler_selenium.py        |  61 +++++-----
 MarketPlaces/Nexus/parser.py                  |  37 +++++-
 MarketPlaces/RobinhoodMarket/parser.py        |   2 +-
 MarketPlaces/ThiefWorld/crawler_selenium.py   |  29 ++---
 MarketPlaces/Tor2door/crawler_selenium.py     |  30 ++---
 MarketPlaces/TorBay/crawler_selenium.py       |  16 +--
 MarketPlaces/TorBay/parser.py                 |   2 +
 MarketPlaces/TorMarket/crawler_selenium.py    |  20 ++--
 MarketPlaces/TorMarket/parser.py              |  40 ++++---
 MarketPlaces/Utilities/utilities.py           |   4 +-
 23 files changed, 438 insertions(+), 409 deletions(-)

diff --git a/MarketPlaces/AnonMarket/crawler_selenium.py b/MarketPlaces/AnonMarket/crawler_selenium.py
index e5f5a3d..eab9ea0 100644
--- a/MarketPlaces/AnonMarket/crawler_selenium.py
+++ b/MarketPlaces/AnonMarket/crawler_selenium.py
@@ -162,40 +162,40 @@ def getInterestedLinks():
     # Malware
     links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/malware')
-    # # Bootkits
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/bootkits')
-    # # Backdoors
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/backdoors')
-    # # Keyloggers
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/keyloggers')
-    # # Wireless Trackers
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wireless_trackers')
-    # # Screen Scrapers
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/screen_scrapers')
-    # # Mobile Forensic Tools
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/mobile_forensics_tools')
-    # # Wifi Jammers
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wifi_jammers')
-    # # Carding
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/carding')
-    # # Worms
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/worms')
-    # # Viruses
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/viruses')
-    # # Trojans
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/trojans')
-    # # Botnets
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/botnets')
-    # # Security Technology
-    # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security_technology')
-    # # Hacks
-    # 
links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/hacks') - # # Exploit kits - # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/exploit_kit') - # # Security - # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security') - # # Ransomware - # links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/ransomware') + # Bootkits + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/bootkits') + # Backdoors + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/backdoors') + # Keyloggers + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/keyloggers') + # Wireless Trackers + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wireless_trackers') + # Screen Scrapers + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/screen_scrapers') + # Mobile Forensic Tools + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/mobile_forensics_tools') + # Wifi Jammers + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/wifi_jammers') + # Carding + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/carding') + # Worms + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/worms') + # Viruses + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/viruses') + # Trojans + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/trojans') + # Botnets + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/botnets') + # Security Technology + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security_technology') + # Hacks + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/hacks') + # Exploit kits + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/exploit_kit') + # Security + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/security') + # Ransomware + links.append('http://2r7wa5og3ly4umqhmmqqytae6bufl5ql5kz7sorndpqtrkc2ri7tohad.onion/category/ransomware') return links @@ -235,12 +235,12 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() # Go back to listing after visiting each product - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break # Locate the next page link try: diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index 130f5ab..b8ef27a 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -34,7 +34,7 @@ def verifyMarketPlace(cur, nameMarket): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['market_id'] else: return 0 @@ -54,7 +54,7 @@ def verifyVendor(cur, nameVendor, marketId): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['vendor_id'] else: return 0 @@ -73,7 +73,7 @@ def verifyItem(cur, marketId, 
vendorId, nameItem): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['item_id'] else: return 0 @@ -91,7 +91,7 @@ def getLastMarketPlace(cur): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['market_id'] else: return 0 @@ -110,7 +110,7 @@ def getLastVendor(cur): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['vendor_id'] else: return 0 @@ -128,7 +128,7 @@ def getLastVendorVersion(cur, vendorId): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['version_vendor'] else: return 0 @@ -146,7 +146,7 @@ def getLastItem(cur): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['item_id'] else: return 0 @@ -165,7 +165,7 @@ def getLastItemVersion(cur, itemId): recset = cur.fetchall() if recset: - return recset[0][0] + return recset[0]['version_item'] else: return 0 @@ -225,9 +225,9 @@ def create_vendor(cur, row, marketId): # decode_decrypt_image_in_base64(recset[0][5]) - if (str(recset[0][3]) != str(row[2] if row[2] != '-1' else None) or # there was a change in the vendor information - str(recset[0][4]) != str(row[3] if row[3] != '-1' else None) or - str(recset[0][5]) != str(row[21] if row[21] != '-1' else None)): + if (str(recset[0]['rating_vendor']) != str(row[2] if row[2] != '-1' else None) or # there was a change in the vendor information + str(recset[0]['successfultransactions_vendor']) != str(row[3] if row[3] != '-1' else None) or + str(recset[0]['image_vendor']) != str(row[21] if row[21] != '-1' else None)): vendorVersionId = int(getLastVendorVersion(cur, vendorId) + 1) @@ -236,11 +236,11 @@ def create_vendor(cur, row, marketId): "Values (%s, %s, %s, %s, %s, %s, %s, %s)" recset = [vendorId, vendorVersionId, marketId, - recset[0][2], - recset[0][3], - recset[0][4], - recset[0][5], - recset[0][6]] + recset[0]['name_vendor'], + recset[0]['rating_vendor'], + recset[0]['successfultransactions_vendor'], + recset[0]['image_vendor'], + recset[0]['dateinserted_vendor']] cur.execute(sql, recset) @@ -308,14 +308,22 @@ def create_items(cur, row, marketId, vendorId): # decode_decrypt_image_in_base64(recset[0][20]) - if (str(recset[0][4]) != str(row[5] if row[5] != '-1' else None) or str(recset[0][5]) != str(row[6] if row[6] != '-1' else None) or - str(recset[0][6]) != str(row[7] if row[7] != '-1' else None) or str(recset[0][7]) != str(row[8] if row[8] != '-1' else None) or - str(recset[0][8]) != str(row[9] if row[9] != '-1' else None) or str(recset[0][9]) != str(row[10] if row[10] != '-1' else None) or - str(recset[0][10]) != str(row[11] if row[11] != '-1' else None) or str(recset[0][11]) != str(row[12] if row[12] != '-1' else None) or - str(recset[0][12]) != str(row[13] if row[13] != '-1' else None) or str(recset[0][13]) != str(row[14] if row[14] != '-1' else None) or - str(recset[0][14]) != str(row[15] if row[15] != '-1' else None) or str(recset[0][15]) != str(row[16] if row[16] != '-1' else None) or - str(recset[0][16]) != str(row[17] if row[17] != '-1' else None) or str(recset[0][17]) != str(row[18] if row[18] != '-1' else None) or - str(recset[0][18]) != str(row[19] if row[19] != '-1' else None) or str(recset[0][20]) != str(row[20] if row[20] != '-1' else None)): + if (str(recset[0]['description_item']) != str(row[5] if row[5] != '-1' else None) or + str(recset[0]['cve_item']) != str(row[6] if row[6] != '-1' else None) or + str(recset[0]['ms_item']) != str(row[7] if row[7] != '-1' else None) or + str(recset[0]['category_item']) != str(row[8] if row[8] != '-1' else 
None) or + str(recset[0]['views_item']) != str(row[9] if row[9] != '-1' else None) or + str(recset[0]['reviews_item']) != str(row[10] if row[10] != '-1' else None) or + str(recset[0]['rating_item']) != str(row[11] if row[11] != '-1' else None) or + str(recset[0]['dateadded_item']) != str(row[12] if row[12] != '-1' else None) or + str(recset[0]['btc_item']) != str(row[13] if row[13] != '-1' else None) or + str(recset[0]['usd_item']) != str(row[14] if row[14] != '-1' else None) or + str(recset[0]['euro_item']) != str(row[15] if row[15] != '-1' else None) or + str(recset[0]['quantitysold_item']) != str(row[16] if row[16] != '-1' else None) or + str(recset[0]['quantityleft_item']) != str(row[17] if row[17] != '-1' else None) or + str(recset[0]['shippedfrom_item']) != str(row[18] if row[18] != '-1' else None) or + str(recset[0]['shippedto_item']) != str(row[19] if row[19] != '-1' else None) or + str(recset[0]['image_item']) != str(row[20] if row[20] != '-1' else None)): itemVersionId = int(getLastItemVersion(cur, itemId) + 1) @@ -326,27 +334,27 @@ def create_items(cur, row, marketId, vendorId): "%s, %s, %s, %s, %s, %s)" recset = [itemId, itemVersionId, marketId, vendorId, - recset[0][3], - recset[0][4], - recset[0][5], - recset[0][6], - recset[0][7], - recset[0][8], - recset[0][9], - recset[0][10], - recset[0][11], - recset[0][12], - recset[0][13], - recset[0][14], - recset[0][15], - recset[0][16], - recset[0][17], - recset[0][18], - recset[0][19], - recset[0][20], - recset[0][21], - recset[0][22], - recset[0][23]] + recset[0]['name_item'], + recset[0]['description_item'], + recset[0]['cve_item'], + recset[0]['ms_item'], + recset[0]['category_item'], + recset[0]['views_item'], + recset[0]['reviews_item'], + recset[0]['rating_item'], + recset[0]['dateadded_item'], + recset[0]['btc_item'], + recset[0]['usd_item'], + recset[0]['euro_item'], + recset[0]['quantitysold_item'], + recset[0]['quantityleft_item'], + recset[0]['shippedfrom_item'], + recset[0]['shippedto_item'], + recset[0]['lastseen_item'], + recset[0]['image_item'], + recset[0]['href_item'], + recset[0]['dateinserted_item'], + recset[0]['classification_item']] cur.execute(sql, recset) @@ -401,7 +409,7 @@ def create_database(cur, con): sql = "create table vendors(vendor_id integer not null, market_id integer not null, name_vendor character " \ "varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor integer " \ - "null, image_vendor character varying(1000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \ + "null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \ "constraint vendors_pk primary key (vendor_id), constraint vendors_market_id_fkey foreign key (market_id) references marketplaces (" \ "market_id))" cur.execute(sql) @@ -411,7 +419,7 @@ def create_database(cur, con): sql = "create table vendors_history(vendor_id integer not null, version_vendor integer not null, market_id integer not null, name_vendor " \ "character varying(255) not null, rating_vendor character varying(255), successfultransactions_vendor " \ - "integer null, image_vendor character varying(1000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \ + "integer null, image_vendor character varying(10000000) null, dateinserted_vendor timestamp(6) with time zone not null, " \ "constraint vendors_history_pk primary key (vendor_id, version_vendor), constraint vendors_history_vendor_id_fkey foreign key (" \ "vendor_id) references vendors 
(vendor_id), constraint vendors_history_market_id_fkey foreign key (" \ "market_id) references marketplaces (market_id))" @@ -424,7 +432,7 @@ def create_database(cur, con): "character varying(25) null, btc_item character varying(255) null, usd_item character varying(255) " \ "null, euro_item character varying(255) null, quantitysold_item integer null, quantityleft_item " \ "character varying(255) null, shippedfrom_item character varying(255) null, shippedto_item character " \ - "varying(255) null, lastseen_item timestamp(6) with time zone not null, image_item character varying(1000000) null, " \ + "varying(255) null, lastseen_item timestamp(6) with time zone not null, image_item character varying(10000000) null, " \ "href_item character varying(255) not null, dateinserted_item timestamp(6) with time zone not null, " \ "classification_item double precision not null, constraint items_pk primary key (item_id), constraint " \ "items_market_id_fkey foreign key (market_id) references marketplaces (market_id),constraint " \ @@ -441,7 +449,7 @@ def create_database(cur, con): "character varying(25) null, btc_item character varying(255) null, usd_item character varying(255) " \ "null, euro_item character varying(255) null, quantitysold_item integer null, quantityleft_item " \ "character varying(255) null, shippedfrom_item character varying(255) null, shippedto_item character " \ - "varying(255) null, lastseen_item timestamp(6) with time zone not null, image_item character varying(1000000) null, " \ + "varying(255) null, lastseen_item timestamp(6) with time zone not null, image_item character varying(10000000) null, " \ "href_item character varying(255) not null, dateinserted_item timestamp(6) with time zone not null, " \ "classification_item double precision not null, constraint items_history_pk primary key (item_id, version_item), " \ "constraint items_history_market_id_fkey foreign key (market_id) references marketplaces (market_id), " \ diff --git a/MarketPlaces/DarkMatter/crawler_selenium.py b/MarketPlaces/DarkMatter/crawler_selenium.py index a390abf..b75eea5 100644 --- a/MarketPlaces/DarkMatter/crawler_selenium.py +++ b/MarketPlaces/DarkMatter/crawler_selenium.py @@ -171,14 +171,14 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # # digital fraud software - # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=76') - # # legit - # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=78') - # # hack guides - # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=94') - # # services - # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=117') + # digital fraud software + links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=76') + # legit + links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=78') + # hack guides + links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=94') + # services + links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=117') # software/malware links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=121') @@ -221,12 +221,12 @@ def crawlForum(driver): 
time.sleep(3) # to keep from detecting click speed driver.back() - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: link = driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href') diff --git a/MarketPlaces/DarkMatter/parser.py b/MarketPlaces/DarkMatter/parser.py index 9ff203f..2a681bc 100644 --- a/MarketPlaces/DarkMatter/parser.py +++ b/MarketPlaces/DarkMatter/parser.py @@ -98,8 +98,11 @@ def darkmatter_description_parser(soup): sold = cleanString(temp2.strip()) # Finding Product Image - image = soup.find('td', {"class": "vtop"}).find('img').get('src') - image = image.split('base64,')[-1] + image = soup.find('td', {"class": "vtop"}).find('img') + if image is not None: + image = image.get('src').split('base64,')[-1] + else: + image = '-1' # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, @@ -163,6 +166,10 @@ def darkmatter_listing_parser(soup): index = temp.index("pks x ") result = temp[index + len("pks x "):] name.append(cleanString(result)) + elif ("job x " in temp): + index = temp.index("job x ") + result = temp[index + len("job x "):] + name.append(cleanString(result)) CVE.append("-1") MS.append("-1") diff --git a/MarketPlaces/DigitalThriftShop/crawler_selenium.py b/MarketPlaces/DigitalThriftShop/crawler_selenium.py index 3162a74..163e135 100644 --- a/MarketPlaces/DigitalThriftShop/crawler_selenium.py +++ b/MarketPlaces/DigitalThriftShop/crawler_selenium.py @@ -89,7 +89,7 @@ def createFFDriver(): ff_prof.set_preference("network.cookie.lifetimePolicy", 2) ff_prof.set_preference("network.dns.disablePrefetch", True) ff_prof.set_preference("network.http.sendRefererHeader", 0) - ff_prof.set_preference("permissions.default.image", 2) + ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) ff_prof.set_preference("browser.download.manager.showWhenStarting", False) ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") @@ -172,16 +172,18 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] + # Apps + links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/apps/') + # Books + links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/books/') # Bot nets links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/botnets/') - # # data leak - # links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/dataleak/') - # # databases - # links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/databases/') - # # ransomware - # links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/ransomware/') - # # rats - # links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/rats/') + # ransomware + links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/ransomware/') + # rats + links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/rats/') + # scripts + links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/scripts/') return links @@ 
-220,16 +222,16 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - link = driver.find_element(by=By.XPATH, value= - '/html/body/div[1]/div[2]/div/div[2]/main/div[1]/nav/ul/li[5]/a').get_attribute('href') + nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/div[2]/main/div[1]/nav') + link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='→').get_attribute('href') if link == "": raise NoSuchElementException count += 1 diff --git a/MarketPlaces/DigitalThriftShop/parser.py b/MarketPlaces/DigitalThriftShop/parser.py index ad275e2..8a4126c 100644 --- a/MarketPlaces/DigitalThriftShop/parser.py +++ b/MarketPlaces/DigitalThriftShop/parser.py @@ -50,22 +50,17 @@ def digitalThriftShop_description_parser(soup: Tag): product_category = soup.find("span", {"class": "posted_in"}).find("a").text category = cleanString(product_category.strip()) - - try: - product_rating: Tag = soup.find("div", {"class": "woocommerce-product-rating"}) + + product_rating: Tag = soup.find("div", {"class": "woocommerce-product-rating"}) + if product_rating is not None: rating_item = product_rating.find("strong", {"class": "rating"}).text - reviews = product_rating.find("span", {"Class": "rating"}).text - - except Exception as e: - pass + reviews = product_rating.find("span", {"class": "rating"}).text product_BTC = soup.find("div", {"id": "price-BTC"}).find("span", {"class": "priceinfo cw-noselect"}).text BTC = cleanString(product_BTC.strip()) product_USD = soup.find("span", {"class": "woocommerce-Price-amount amount"}).text USD = cleanString(product_USD.replace("$", "").strip()) - - # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, @@ -110,9 +105,7 @@ def digitalThriftShop_listing_parser(soup: Tag): product_category = soup.find("h1", {"class": "woocommerce-products-header__title page-title"}).text products_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-5"}).find_all("li") - - - + for product in products_list: nm += 1 vendor.append(mktName) diff --git a/MarketPlaces/HiddenMarket/crawler_selenium.py b/MarketPlaces/HiddenMarket/crawler_selenium.py index fb466fa..533129a 100644 --- a/MarketPlaces/HiddenMarket/crawler_selenium.py +++ b/MarketPlaces/HiddenMarket/crawler_selenium.py @@ -198,28 +198,28 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # # Civil Software - # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/civil_softwares') - # # Tutorials - Carding - # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/carding') + # Civil Software + links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/civil_softwares') + # Tutorials - Carding + links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/carding') # Digital - Hacks links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/hacks') - # # Digital - Exploit Kit - # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/exploit_kit') - # # 0Day - # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/0day') - # # Digital 
Forensics - # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/digital_forensics') - # # Tutorials - Mining - # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/mining') - # # Tutorials - Worms - # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/worms') - # # Tutorials - Viruses - # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/viruses') - # # Tutorials - Trojans - # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/trojans') - # # Tutorials - Botnets - # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/botnets') + # Digital - Exploit Kit + links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/exploit_kit') + # 0Day + links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/0day') + # Digital Forensics + links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/digital_forensics') + # Tutorials - Mining + links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/mining') + # Tutorials - Worms + links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/worms') + # Tutorials - Viruses + links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/viruses') + # Tutorials - Trojans + links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/trojans') + # Tutorials - Botnets + links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/botnets') return links @@ -262,12 +262,12 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: pageCount += 1 diff --git a/MarketPlaces/HiddenMarket/parser.py b/MarketPlaces/HiddenMarket/parser.py index 106cc6d..eb36a5b 100644 --- a/MarketPlaces/HiddenMarket/parser.py +++ b/MarketPlaces/HiddenMarket/parser.py @@ -81,10 +81,7 @@ def hiddenmarket_description_parser(soup): # Finding the Product description describe = bae.find('div', {"class": "twotabs"}).find('div', {'class': "tab1"}).text - describe = describe.replace("\n", " ") - describe = describe.replace("\r", " ") - describe = describe.replace("-", " ") - describe = describe.strip() + describe = cleanString(describe.strip()) # Finding Product Image image = soup.find('div', {"class": "thumbnails"}).find('img', {"class": "bigthumbnail"}) @@ -169,7 +166,6 @@ def hiddenmarket_listing_parser(soup): category.append(cat) - # Adding the url to the list of urls link = card.find_all('a') link = link[1].get('href') @@ -177,7 +173,7 @@ def hiddenmarket_listing_parser(soup): href.append(link) # Finding Product Name - product = card.next_sibling.find('div', {'class': "title"}) + product = card.find('div', {'class': "title"}) product = product.text product = product.replace('\n', ' ') product = product.replace(",", "") @@ -188,7 +184,7 @@ def hiddenmarket_listing_parser(soup): image.append("-1") # Finding Vendor - vendor_name = card.text + vendor_name = card.find('div', {"class": "seller"}).text vendor_name = vendor_name.replace(",", "") vendor_name = vendor_name.strip() vendor.append(vendor_name) @@ -196,12 
+192,12 @@ def hiddenmarket_listing_parser(soup): image_vendor.append("-1") # Finding USD - usd = card.next_sibling.find('div', {"class": "buttons"}).find('div', {'class': "price"}).text + usd = card.find('div', {"class": "buttons"}).find('div', {'class': "price"}).text usd = usd.replace("USD", "") usd = usd.strip() USD.append(usd) - tb = card.next_sibling.find("div", {"class": "stats"}) + tb = card.find("div", {"class": "stats"}) tb = tb.find_all('td') # Finding Reviews @@ -221,17 +217,15 @@ def hiddenmarket_listing_parser(soup): if num == '0': item_rating = '-1' else: - item_rating = card.next_sibling.find('div', {'class': 'stats'}).find('div', {'class': "stars2"}) + item_rating = card.find('div', {'class': 'stats'}).find('div', {'class': "stars2"}) item_rating = item_rating.get('style') item_rating = item_rating.replace("width:", "") item_rating = item_rating.replace("%", "") - item_rating = (float(item_rating) * 5.0) / 100.0 - item_rating = "{:.{}f}".format(item_rating, 2) rating_item.append(item_rating) # Finding shipping info - shipping = card.next_sibling.find('div', {'class': "shipping"}).text.split('>') + shipping = card.find('div', {'class': "shipping"}).text.split('>') # SHip from origin = shipping[0].strip() shipFrom.append(origin) diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index 985ef69..c5af58b 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -5,6 +5,7 @@ import os import codecs import shutil import traceback +from psycopg2.extras import RealDictCursor from MarketPlaces.DB_Connection.db_connection import * from MarketPlaces.DarkFox.parser import * @@ -298,7 +299,7 @@ def new_parse(marketPlace, url, createLog): # Connecting to the database con = connectDataBase() - cur = con.cursor() + cur = con.cursor(cursor_factory=RealDictCursor) # Creating the tables (The database should be created manually) create_database(cur, con) @@ -420,7 +421,7 @@ def new_parse(marketPlace, url, createLog): if createLog: logFile.write( str(nError) + f". 
There was a problem to locate the file(s) for {listingFile}" - f" in the Description section!\n") + f" in the Description section!\n\n") if not (readDescriptionError or parseDescriptionError or persistDescriptionError or moveDescriptionError or findDescriptionError): @@ -430,4 +431,7 @@ def new_parse(marketPlace, url, createLog): if createLog: logFile.close() + cur.close() + con.close() + print("Parsing the " + marketPlace + " market and data classification done.") diff --git a/MarketPlaces/LionMarketplace/crawler_selenium.py b/MarketPlaces/LionMarketplace/crawler_selenium.py index d0bac37..e20f630 100644 --- a/MarketPlaces/LionMarketplace/crawler_selenium.py +++ b/MarketPlaces/LionMarketplace/crawler_selenium.py @@ -128,7 +128,21 @@ def getAccess(): def login(driver): # wait for page to show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[2]/div[2]/div[2]/div[1]/div/div[2]/div"))) + (By.XPATH, '//*[@id="username"]'))) + + # entering username and password into input boxes + usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') + # Username here + usernameBox.send_keys('blabri') + passwordBox = driver.find_element(by=By.XPATH, value='//*[@id="password"]') + # Password here + passwordBox.send_keys('fishowal') + + input("Press ENTER when CAPTCHA is completed\n") + + # wait for listing page show up (This Xpath may need to change based on different seed url) + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/nav/div/div/ul[2]/form/button"))) # Saves the crawled html page, makes the directory path for html pages if not made @@ -172,14 +186,10 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # Software/Malware - links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/16') - # # Carding - # links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/20') - # # Hacking - # links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/ba142ac0-c7e7-11ec-9bd1-fdd89c3d3f91') - # # tutorial - # links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/19') + # Hacking + links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/ba142ac0-c7e7-11ec-9bd1-fdd89c3d3f91') + # Digital + links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/12') return links @@ -218,16 +228,16 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - link = driver.find_element(by=By.XPATH, value= - '/html/body/div[2]/div[2]/div/div[2]/nav/ul/li[5]/a').get_attribute('href') + nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div/div[2]/nav') + link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href') if link == "": raise NoSuchElementException count += 1 diff --git a/MarketPlaces/LionMarketplace/parser.py b/MarketPlaces/LionMarketplace/parser.py index a37febf..3b5dc27 100644 --- a/MarketPlaces/LionMarketplace/parser.py +++ b/MarketPlaces/LionMarketplace/parser.py @@ -45,11 +45,16 @@ def lionmarketplace_description_parser(soup): table = soup.find('table') rows = table.findAll('tr') - # successful transaction 
- success = "-1" - - # vendor rating 5 - rating_vendor = '-1' + # vendor rating + pos = soup.find('span', {"class": "fas fa-plus-circle text-success"}).parent.text + pos = int(pos.strip()) + neu = soup.find('span', {"class": "fas fa-stop-circle text-secondary"}).parent.text + neu = int(neu.strip()) + neg = soup.find('span', {"class": "fas fa-minus-circle text-danger"}).parent.text + neg = int(neg.strip()) + total = pos + neu + neg + if total > 0: + rating_vendor = str((pos + 0.5*neu) / total) # product name temp = soup.find('div', {'class', 'row'}).find('h2').text @@ -64,20 +69,12 @@ def lionmarketplace_description_parser(soup): image = image.get('src') image = image.split('base64,')[-1] - CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) dont worry about that much - MS = "-1" # 6 Product_MS_Classification (Microsoft Security) dont worry about that much - - # product category - temp = rows[1].find('strong').text - category = cleanString(temp.strip()) - - # product number of views - views = "-1" - reviews = "-1" # 9 Product_Number_Of_Reviews - rating_item = "-1" # 10 Product_Rating - addDate = "-1" # 11 Product_AddedDate - # BTC selling price box box-rounded mt-2 - BTC = "-1" + full = rows[0].findAll('i', {"class": "fas fa-star"}) + half = rows[0].find('i', {"class": "fas fa-star-half-alt"}) + rating_item = len(full) + if half is not None: + rating_item += 0.5 + rating_item = str(rating_item) # USD selling price temp = rows[2].find('strong').text @@ -87,37 +84,22 @@ def lionmarketplace_description_parser(soup): temp = temp.replace("$", "") USD = cleanString((temp.strip())) - EURO = "-1" # 14 Product_EURO_SellingPrice - # product sold - if (len(rows) <= 5): - temp = rows[4].find('td').text - string = cleanString(temp) - if (string == 'Left/Sold'): - temp = rows[4].findAll('td') - temp = temp[1].findAll('span') - - # left - temp2 = temp[1].text - temp3 = temp[1].text - - if(" items" in temp2): - temp2 = temp2.replace(" items", "") - if(" items" in temp3): - temp3 = temp3.replace(" items", "") - - sold = (cleanString(temp2.strip())) - left = cleanString(temp3.strip()) - else: - sold = '-1' - left = "-1" + temp = rows[4].find('td') + if temp is not None and cleanString(temp.text.strip()) == 'Left/Sold': + temp = rows[4].findAll('td') + temp = temp[1].findAll('span') + + # left + sold = temp[1].text + left = temp[0].text + + sold = cleanNumbers(sold.strip()) + left = cleanNumbers(left.strip()) else: sold = '-1' left = "-1" - shipFrom = "-1" # 17 Product_ShippedFrom - shipTo = "-1" # 18 Product_ShippedTo - # Populating the final variable (this should be a list with all fields scraped) row = (vendor, rating_vendor, success, name, describe, CVE, MS, category, views, reviews, rating_item, addDate, BTC, USD, EURO, sold, left, shipFrom, shipTo, image, vendor_image) @@ -195,14 +177,16 @@ def lionmarketplace_listing_parser(soup): MS.append('-1') # product category - temp = row[2].text + temp = row[1].text temp = temp.replace("Category: ", "") category.append(cleanString(temp.strip())) describe.append('-1') # product views - views.append("-1") + vnum = listing.find('p', {"class": "position-absolute bg-primary opacity-60 text-white mt-4 mr-5 pr-1"}).text + views.append(cleanNumbers(vnum.strip())) + reviews.append('-1') # 10 Product_Number_Of_Reviews rating_item.append('-1') # 11 Product_Rating addDate.append('-1') # 12 Product_AddDate @@ -212,9 +196,7 @@ def lionmarketplace_listing_parser(soup): # USD temp = row[0].find('strong').text - if ' $' in temp: - temp = 
temp.replace(" $", "") - USD.append(cleanString(temp.strip())) # 14 Product_USD_SellingPrice + USD.append(cleanNumbers(temp.strip())) # 14 Product_USD_SellingPrice EURO.append("-1") # 15 Product_EURO_SellingPrice diff --git a/MarketPlaces/MetaVerseMarket/crawler_selenium.py b/MarketPlaces/MetaVerseMarket/crawler_selenium.py index d5783a4..44eb335 100644 --- a/MarketPlaces/MetaVerseMarket/crawler_selenium.py +++ b/MarketPlaces/MetaVerseMarket/crawler_selenium.py @@ -2,8 +2,6 @@ __author__ = 'Helium' ''' MetaVerseMarket Marketplace Crawler (Selenium) -not complete -need to go through multiple pages... ''' from selenium import webdriver @@ -128,6 +126,8 @@ def getAccess(): # then allows for manual solving of captcha in the terminal #@param: current selenium web driver def login(driver): + WebDriverWait(driver, 100).until(EC.visibility_of_element_located( + (By.XPATH, '//*[@id="username"]'))) # entering username and password into input boxes usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') @@ -184,12 +184,12 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # hacking - links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hacking') - # # hosting - # links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hosting') - # # hacking guides and tutorials - # links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/hacking-guides-and-tutorials') + # software and malware + links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/softwares-and-malwares') + # guides and tutorials + links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/guides-and-tutorials') + # services + links.append('http://mdbvvcfwl3fpckiraucv7gio57yoslnhfjxzpoihf4fgdkdd7bwyv7id.onion/products/services') return links @@ -228,16 +228,16 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: link = driver.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href') - if link == "": + if link.endswith('#') or link == "": raise NoSuchElementException count += 1 diff --git a/MarketPlaces/MetaVerseMarket/parser.py b/MarketPlaces/MetaVerseMarket/parser.py index c43b823..5c12390 100644 --- a/MarketPlaces/MetaVerseMarket/parser.py +++ b/MarketPlaces/MetaVerseMarket/parser.py @@ -40,40 +40,48 @@ def metaversemarket_description_parser(soup): name = soup.find('div', {'class': "panel-heading"}).text name = cleanString(name.strip()) - # Finding Vendor temp = soup.findAll('div', {'class': "col-xs-12 col-sm-6 mt-5"}) + + # Finding Product Image + image = temp[0].find('img') + image = image.get('src') + image = image.split('base64,')[-1] + + # Finding Vendor temp = temp[1].findAll('span') vendor = temp[1].find('b').text vendor = cleanString(vendor.strip()) - # Finding Product Reviews - reviews = soup.find('span', {'class': "badge bg-success fs-12px"}).text.strip() - - # Finding Successful Transactions - # NA + # Finding Vendor Rating + pos = soup.find('span', {'class': "badge bg-success fs-12px"}).text + pos = int(cleanNumbers(pos).strip()) + neg = soup.find('span', {'class': "badge bg-danger fs-12px"}).text + neg = int(cleanNumbers(neg).strip()) + total = pos + neg + if total > 0: + rating_vendor = str(pos / total) # Finding Prices 
USD = soup.find('h3', {'class': "mb-2"}).text - USD = USD.replace("Price: $", "").strip() + USD = cleanNumbers(USD).strip() # Finding the Product Category temp = soup.select('div[class="mt-2"]')[1].text temp = temp.replace("Category:", "") category = temp.strip() + # Finding Number of Views + views = soup.find('button', {"class": "btn btn-secondary text-center w-33 fw-bold"}).text + views = views.strip() + # Finding the Product Quantity Available - # temp = soup.find('em', {'class': "icon ni ni-layers-fill"}).parent.parent.parent - # left = temp.text - # left = left.replace("Supply:", "") - # left = left.strip() - temp = soup.findAll('span', {'class': "badge bg-success"}) - temp = temp[1].text.split("/") + temp = soup.find('button', {"class": "btn btn-success text-center w-33 fw-bold"}).text + temp = temp.split("/") left = temp[1].strip() # Finding Number Sold sold = temp[0].strip() - # Finding Shipment Information (Origin) temp = soup.find('div', {'class': "alert alert-info"}).text temp = temp.split("to") @@ -85,21 +93,7 @@ def metaversemarket_description_parser(soup): # Finding the Product description describe = soup.find('p', {'class': "card-text"}).text - describe = describe.replace("\n", " ") - describe = describe.strip() - - '''# Finding the Number of Product Reviews - tag = soup.findAll(text=re.compile('Reviews')) - for index in tag: - reviews = index - par = reviews.find('(') - if par >=0: - reviews = reviews.replace("Reviews (","") - reviews = reviews.replace(")","") - reviews = reviews.split(",") - review = str(abs(int(reviews[0])) + abs(int(reviews[1]))) - else : - review = "-1"''' + describe = cleanString(describe.strip()) # Searching for CVE and MS categories cve = soup.findAll(text=re.compile('CVE-\d{4}-\d{4}')) @@ -177,10 +171,7 @@ def metaversemarket_listing_parser(soup): # Finding Prices price = a.find('strong').text - price = price.replace("Buy for $", "") - price = price.strip() - USD.append(price) - + USD.append(cleanNumbers(price).strip()) # Finding the Vendor temp = a.find('div', {'class': "mt-1 fs-12px"}) @@ -194,35 +185,30 @@ def metaversemarket_listing_parser(soup): cat = cat.strip() category.append(cat) - badge = a.find('span', {'class': "badge bg-success"}) + ul = a.find('ul', {"class": "product-actions"}) + # Finding Number Sold and Quantity Left - temp = badge.text + temp = ul.find('span', {'class': "badge bg-success"}).text temp = temp.split("/") num = temp[0] - num = num.strip() - sold.append(num) + num = num.replace('k', '000') + sold.append(cleanNumbers(num).strip()) quant = temp[1] - quant = quant.strip() - qLeft.append(quant) - - # Finding Successful Transactions - # NA - - # Finding Product review - review = a.find('span', {'class': "badge bg-success fs-12px"}).text - review = review.replace("+ ", "") - reviews.append(review) + quant = quant.replace('k', '000') + qLeft.append(cleanNumbers(quant).strip()) # Finding Descrption - description = a.find('p', {'class': "alert alert-light text-ssbold p-1"}).text - description = description.replace("\n", " ") - description = description.strip() - describe.append(cleanString(description)) + # description = a.find('p', {'class': "alert alert-light text-ssbold p-1"}).text + # description = description.replace("\n", " ") + # description = description.strip() + # describe.append(cleanString(description)) # Finding Number of Views - view = a.find('span', {'class': "badge bg-primary"}).text.strip() - views.append(view) + view = ul.find('span', {'class': "badge bg-primary"}).text + view = view.replace('.', '') + view = 
view.replace('K', '000') + views.append(view.strip()) # Find where ships from ships = a.find('div', {'class': "alert alert-info item_alert fs-12px p-1"}) diff --git a/MarketPlaces/Nexus/crawler_selenium.py b/MarketPlaces/Nexus/crawler_selenium.py index 4ae7cfe..bd76f59 100644 --- a/MarketPlaces/Nexus/crawler_selenium.py +++ b/MarketPlaces/Nexus/crawler_selenium.py @@ -36,6 +36,7 @@ def startCrawling(): if driver != 'down': try: + input("Press ENTER when page loads after DDOS protection") crawlForum(driver) except Exception as e: print(driver.current_url, e) @@ -163,22 +164,22 @@ def getInterestedLinks(): # malware links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/') - # # hacking-spam - # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/') - # # hacking services - # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/hacking/') - # # programming services - # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/programacao/') - # # remote admin services - # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/administracao-remota/') - # # hacking guides - # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-hacking/') - # # malware guides - # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-malware/') - # # fraud guides - # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-fraudes/') - # # fraud software - # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/fraudes/software-de-fraude/') + # hacking-spam + links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/') + # hacking services + links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/hacking/') + # programming services + links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/programacao/') + # remote admin services + links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/servicos/administracao-remota/') + # hacking guides + links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-hacking/') + # malware guides + links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-malware/') + # fraud guides + links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/guias-tutoriais/guia-de-fraudes/') + # fraud software + links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/fraudes/software-de-fraude/') return links @@ -206,9 +207,12 @@ def crawlForum(driver): driver.refresh() # waiting for btc price to load - WebDriverWait(driver, 30).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/ul/li[1]/div/span/span[3]"))) - time.sleep(5) + try: + WebDriverWait(driver, 
1).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/ul/li[1]/div/span/span[3]"))) + time.sleep(5) + except: + pass html = driver.page_source savePage(driver, html, link) @@ -222,18 +226,21 @@ def crawlForum(driver): driver.refresh() # waiting for btc price to load - WebDriverWait(driver, 30).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/div[3]/div[2]/p/span[3]"))) + try: + WebDriverWait(driver, 1).until(EC.visibility_of_element_located( + (By.XPATH, "/html/body/div[1]/div[2]/div/div/main/div[3]/div[2]/p/span[3]"))) + except: + pass savePage(driver, driver.page_source, item) driver.back() - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: link = driver.find_element(by=By.LINK_TEXT, value='→').get_attribute('href') diff --git a/MarketPlaces/Nexus/parser.py b/MarketPlaces/Nexus/parser.py index f673110..107a80a 100644 --- a/MarketPlaces/Nexus/parser.py +++ b/MarketPlaces/Nexus/parser.py @@ -8,6 +8,9 @@ from bs4 import BeautifulSoup import re +usd_to_brl_r = None + + #parses description pages, so takes html pages of description pages using soup object, and parses it for info it needs #stores info it needs in different lists, these lists are returned after being organized #@param: soup object looking at html page of description page @@ -43,10 +46,19 @@ def nexus_description_parser(soup): name_of_product = soup.find("h1", {"class": "product_title entry-title"}).text name = cleanString(name_of_product.strip()) + # Finding USD Price + real = soup.find('span', {"class": "price"}).find('bdi').text + real = real.split(',') + whole = cleanNumbers(real[0]).replace('.', '') + real = whole + '.' + real[1] + usd = float(real) / usd_to_brl_r + USD = str(round(usd, 2)) + # Find the BTC Price prices = soup.find('p', {"class": "price"}).findAll('span', {"class": "cs"}) - BTC = prices[0].text - BTC = cleanNumbers(BTC.strip()) + if len(prices) > 0: + BTC = prices[0].text + BTC = cleanNumbers(BTC.strip()) # finding the description of the product description_div = soup.find("div", {"class": "woocommerce-product-details__short-description"}) @@ -88,6 +100,13 @@ def nexus_description_parser(soup): #return: 'row' that contains a variety of lists that each hold info on the listing page def nexus_listing_parser(soup): + global usd_to_brl_r + while usd_to_brl_r is None: + try: + usd_to_brl_r = float(input("1 US Dollar = (Brazilian Real) ")) + except ValueError: + pass + # Fields to be parsed nm = 0 # *Total_Products (Should be Integer) mktName = "Nexus" # 0 *Marketplace_Name @@ -138,14 +157,22 @@ def nexus_listing_parser(soup): product_image = product_image.split('base64,')[-1] image.append(product_image) + # Finding USD Price + real = product.find('span', {"class": "price"}).find('bdi').text + real = real.split(',') + whole = cleanNumbers(real[0]).replace('.', '') + real = whole + '.' 
+ real[1] + usd = float(real) / usd_to_brl_r + USD.append(str(round(usd, 2))) + # Finding BTC Price prices = product.find('span', {"class": "price"}).findAll('span', {"class": "cs"}) - price = prices[0].text - BTC.append(cleanNumbers(price.strip())) + if len(prices) > 0: + price = prices[0].text + BTC.append(cleanNumbers(price.strip())) #everything else appends a -1 rating_vendor.append("-1") - USD.append("-1") vendor.append('-1') success.append("-1") CVE.append("-1") diff --git a/MarketPlaces/RobinhoodMarket/parser.py b/MarketPlaces/RobinhoodMarket/parser.py index c036d17..059d327 100644 --- a/MarketPlaces/RobinhoodMarket/parser.py +++ b/MarketPlaces/RobinhoodMarket/parser.py @@ -115,7 +115,7 @@ def Robinhood_listing_parser(soup): # Fields to be parsed nm = 0 # *Total_Products (Should be Integer) - mktName = "Robinhood Market" # 0 *Marketplace_Name + mktName = "RobinhoodMarket" # 0 *Marketplace_Name vendor = [] # 1 *Vendor y rating_vendor = [] # 2 Vendor_Rating success = [] # 3 Vendor_Successful_Transactions diff --git a/MarketPlaces/ThiefWorld/crawler_selenium.py b/MarketPlaces/ThiefWorld/crawler_selenium.py index af5a456..95db8ff 100644 --- a/MarketPlaces/ThiefWorld/crawler_selenium.py +++ b/MarketPlaces/ThiefWorld/crawler_selenium.py @@ -182,12 +182,12 @@ def getInterestedLinks(): # Hacking and DOSS links.append(['Hacking and DOSS', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/35']) - # # Carding Manuals - # links.append('http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/20') - # # Software - # links.append('http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/37') - # # Database - # links.append('http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/38') + # Carding Manuals + links.append(['Carding Manuals', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/20']) + # Software + links.append(['Software', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/37']) + # Database + links.append(['Database', 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/38']) return links @@ -228,16 +228,17 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - link = driver.find_element(by=By.XPATH, value= - '/html/body/div/div[1]/div/div/div[2]/div[3]/div/ul/li[13]/a').get_attribute('href') + nav = driver.find_element(by=By.XPATH, value='/html/body/div/div[1]/div/div/div[2]/div[3]') + right = nav.find_element(by=By.CLASS_NAME, value='pag_right') + link = right.find_element(by=By.TAG_NAME, value='a').get_attribute('href') if link == "": raise NoSuchElementException count += 1 diff --git a/MarketPlaces/Tor2door/crawler_selenium.py b/MarketPlaces/Tor2door/crawler_selenium.py index 36a3e63..17988be 100644 --- a/MarketPlaces/Tor2door/crawler_selenium.py +++ b/MarketPlaces/Tor2door/crawler_selenium.py @@ -47,7 +47,7 @@ def startCrawling(): def login(driver): #wait for login page WebDriverWait(driver, 100).until(EC.visibility_of_element_located( - (By.XPATH, "/html/body/main/div/div/div/div/div/h5"))) + (By.XPATH, '//*[@id="username"]'))) #entering username and password into input boxes usernameBox = driver.find_element(by=By.XPATH, value='//*[@id="username"]') @@ -198,16 +198,16 @@ def getNameFromURL(url): def getInterestedLinks(): 
links = [] - # # Digital - Guides - Hacking - # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=55') - # # Digital - Guides - Others - # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=57') - # # Digital - Software - # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=60') + # Digital - Guides - Hacking + links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=55') + # Digital - Guides - Others + links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=57') + # Digital - Software + links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=60') # Software - Malware links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=69') - # # Software - Others - # links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=78') + # Software - Others + links.append('http://4rnsxgoijvnb6i6ujchlmudfobk7scdxpewf4vb2osdxuzo4rmkucpad.onion/en/products?category=78') return links @@ -243,12 +243,12 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: nav = driver.find_element(by=By.XPATH, value= diff --git a/MarketPlaces/TorBay/crawler_selenium.py b/MarketPlaces/TorBay/crawler_selenium.py index b830acb..18a04be 100644 --- a/MarketPlaces/TorBay/crawler_selenium.py +++ b/MarketPlaces/TorBay/crawler_selenium.py @@ -213,16 +213,16 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: - link = driver.find_element(by=By.XPATH, value= - '/html/body/section/div/div/div[2]/div/div[2]/ul/li[3]/a').get_attribute('href') + nav = driver.find_element(by=By.XPATH, value='/html/body/section/div/div/div[2]/div/div[2]/ul') + link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href') if link == "": raise NoSuchElementException count += 1 diff --git a/MarketPlaces/TorBay/parser.py b/MarketPlaces/TorBay/parser.py index f20b7c9..69d2cfb 100644 --- a/MarketPlaces/TorBay/parser.py +++ b/MarketPlaces/TorBay/parser.py @@ -163,6 +163,8 @@ def torbay_listing_parser(soup): # Populate the final variable (this should be a list with all fields scraped) return organizeProducts(mktName, nm, vendor, rating_vendor, success, name, CVE, MS, category, describe, views, reviews, rating_item, addDate, BTC, USD, EURO, sold, qLeft, shipFrom, shipTo, href, image, image_vendor) + + #called by the crawler to get description links on a listing page #@param: beautifulsoup object that is using the correct html page (listing page) #return: list of description links from a listing page diff --git a/MarketPlaces/TorMarket/crawler_selenium.py b/MarketPlaces/TorMarket/crawler_selenium.py index 86fde52..7021abc 100644 --- a/MarketPlaces/TorMarket/crawler_selenium.py +++ b/MarketPlaces/TorMarket/crawler_selenium.py @@ -170,12 +170,12 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # # Tutorials - # 
links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/') + # Tutorials + links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/') # Malware links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/malware/') - # # Services - # links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/') + # Services + links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/') return links @@ -214,12 +214,12 @@ def crawlForum(driver): savePage(driver, driver.page_source, item) driver.back() - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: link = driver.find_element(by=By.LINK_TEXT, value='NEXT').get_attribute('href') diff --git a/MarketPlaces/TorMarket/parser.py b/MarketPlaces/TorMarket/parser.py index 417f8ac..6a6fac0 100644 --- a/MarketPlaces/TorMarket/parser.py +++ b/MarketPlaces/TorMarket/parser.py @@ -41,28 +41,28 @@ def tormarket_description_parser(soup): #finding the name of the product name_of_product = soup.find("h1", {"class": "product_title entry-title"}).find("a").text name = cleanString(name_of_product.strip()) + #finding the description of the product description_of_product = soup.find("div", {"class": "woocommerce-product-details__short-description"}).text describe = cleanString(description_of_product.strip()) - #finding the replies - inquires_about_product = soup.find("div", {"class": "woocommerce-Tabs-panel woocommerce-Tabs-panel--wcfm_enquiry_tab panel entry-content wc-tab"}).find("p").text - if inquires_about_product == "There are no inquiries yet.": - review = 0 - else: - review = "-1" #fix later pls - - #finding the terms and conditions - terms_and_conditions = soup.find("div", {"class": "woocommerce-Tabs-panel woocommerce-Tabs-panel--wcfm_enquiry_tab panel entry-content wc-tab"}).find("p").text - term = cleanString(terms_and_conditions) #finding the name of the vendor - name_of_vendor = soup.find("div", {"class": "wcfmmp_sold_by_store"}).find("a").text - vendor = cleanString(name_of_vendor) + name_of_vendor = soup.find("div", {"class": "wcfmmp_sold_by_store"}) + if name_of_vendor is not None: + name_of_vendor = name_of_vendor.find("a").text + vendor = cleanString(name_of_vendor.strip()) + else: + vendor = "TorMarket" #finding the price of the item price = soup.find("p", {"class": "price"}).find("bdi").text price_cleaned = price[1:] USD = price_cleaned.strip() + + category = soup.find('span', {"class": "posted_in"}).text + category = category.split(':')[-1] + category = category.replace(',', '/') + category = cleanString(category.strip()) #everything else gets a -1 because they are not found # Populating the final variable (this should be a list with all fields scraped) @@ -128,16 +128,24 @@ def tormarket_listing_parser(soup): rating_item.append(cleanString(rating_score_of_product.strip())) # print("done") #finding the rating of the vendors - rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"}).find("strong").text - rating_vendor.append(cleanString(rating_score_of_vendor.strip())) + rating_score_of_vendor = product.find("div", {"class": "wcfmmp-store-rating"}) + if rating_score_of_vendor is not None: + rating_score_of_vendor = rating_score_of_vendor.find("strong").text + 
rating_vendor.append(cleanString(rating_score_of_vendor.strip())) + else: + rating_vendor.append('-1') # print("done") #finding the cost in USD cost = product.find("span", {"class": "woocommerce-Price-amount amount"}).text USD.append(cost) # print("done") #finding the name of the vendor - vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"}).find("a").text - vendor.append(cleanString(vendor_name.strip())) + vendor_name = product.find("div", {"class": "wcfmmp_sold_by_wrapper"}) + if vendor_name is not None: + vendor_name = vendor_name.find("a").text + vendor.append(cleanString(vendor_name.strip())) + else: + vendor.append(mktName) # print("done") #everything else appends a -1 success.append("-1") diff --git a/MarketPlaces/Utilities/utilities.py b/MarketPlaces/Utilities/utilities.py index 563ffe3..915f284 100644 --- a/MarketPlaces/Utilities/utilities.py +++ b/MarketPlaces/Utilities/utilities.py @@ -246,12 +246,10 @@ def organizeProducts(marketplace, nm, vendor, rating_vendor, success_vendor, nom current_time = datetime.now() day = current_time.strftime("%m/%d/%Y") + ahora = current_time.strftime("%I:%M:%S") for n in range(nm): - current_time += timedelta(seconds=2) - ahora = current_time.strftime("%I:%M:%S") - lne = marketplace # 0 lne += "," lne += vendor[n] # 1
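
A note on the change driving most of the db_connection.py hunks above: prepare_parser.py now creates the cursor with psycopg2's RealDictCursor, so every fetched row is a dict keyed by column name rather than a positional tuple, which is why every recset[0][n] index became a recset[0]['column_name'] lookup. A minimal sketch of the pattern, assuming placeholder connection settings:

    import psycopg2
    from psycopg2.extras import RealDictCursor

    # Hypothetical credentials; the repo supplies its own connection settings.
    con = psycopg2.connect(host='localhost', dbname='markets',
                           user='postgres', password='password')
    cur = con.cursor(cursor_factory=RealDictCursor)

    cur.execute('select market_id from marketplaces where name_market = %s', ('AnonMarket',))
    recset = cur.fetchall()

    # Before this patch: recset[0][0]. Keyed access no longer depends on column order.
    market_id = recset[0]['market_id'] if recset else 0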
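
Several of the parser fixes (DarkMatter's product image, DigitalThriftShop's rating block, TorMarket's vendor fields) apply the same guard: BeautifulSoup's find() returns None on a miss, so a chained call is split and checked before use, with '-1' as the repo's missing-field sentinel. A sketch of the shape:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<td class='vtop'>listing with no image</td>", 'html.parser')

    image = soup.find('td', {'class': 'vtop'}).find('img')
    if image is not None:
        image = image.get('src').split('base64,')[-1]  # keep only the base64 payload
    else:
        image = '-1'  # sentinel the parsers use for missing fields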
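
The crawler diffs all make the same pagination swap: a brittle absolute XPath ending in something like .../li[5]/a is replaced by scoping to the pager element and matching the next-page link by its text, which survives layout shifts. A sketch under the assumption of a generic pager (the nav XPaths in the patch are per-site):

    from selenium.webdriver.common.by import By
    from selenium.common.exceptions import NoSuchElementException

    def next_page_href(driver, nav_xpath='//nav'):  # '//nav' is a hypothetical scope
        """Return the href of the pager's Next link, or None on the last page."""
        try:
            nav = driver.find_element(by=By.XPATH, value=nav_xpath)
            link = nav.find_element(by=By.PARTIAL_LINK_TEXT, value='Next').get_attribute('href')
            if link == '':
                raise NoSuchElementException
            return link
        except NoSuchElementException:
            return None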
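
Nexus prices are listed in Brazilian reais with '1.234,56' formatting, and the parser now converts them using the operator-entered usd_to_brl_r rate (BRL per USD). A worked version of that normalization, with a digit filter standing in for the repo's cleanNumbers helper:

    def brl_to_usd(price_text, usd_to_brl_r):
        real = price_text.split(',')                        # 'R$1.234,56' -> ['R$1.234', '56']
        whole = ''.join(c for c in real[0] if c.isdigit())  # drop symbol and thousands dots
        return str(round(float(whole + '.' + real[1]) / usd_to_brl_r, 2))

    assert brl_to_usd('R$1.234,56', 5.0) == '246.91'        # 1234.56 BRL at 5 BRL/USD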