diff --git a/Forums/BestCardingWorld/__init__.py b/Forums/BestCardingWorld/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/Forums/CryptBB/__init__.py b/Forums/CryptBB/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/Forums/CryptBB/captcha.png b/Forums/CryptBB/captcha.png
deleted file mode 100644
index 08e45fc..0000000
Binary files a/Forums/CryptBB/captcha.png and /dev/null differ
diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py
index 38a52c8..311ac6c 100644
--- a/Forums/Initialization/forums_mining.py
+++ b/Forums/Initialization/forums_mining.py
@@ -135,7 +135,7 @@ if __name__ == '__main__':
         elif forum == 'Libre':
             crawlerLibre()
 
-    print("Scraping process completed!")
+    print("\nScraping process completed!")
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 10d5f0d..272cb44 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -112,8 +112,6 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
 
     try:
 
-        rw = []
-
         if forum == "BestCardingWorld":
             rw = bestcardingworld_listing_parser(soup)
         elif forum == "Cardingleaks":
@@ -128,16 +126,19 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
             rw = procrax_listing_parser(soup)
         elif forum == "Libre":
             rw = libre_listing_parser(soup)
+        else:
+            print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
+            raise Exception
 
         return rw
 
     except:
 
         incrementError()
-        print("There was a problem to read the file " + listingFile + " in the listing section!")
+        print("There was a problem to parse the file " + listingFile + " in the listing section!")
         traceback.print_exc()
         if createLog:
             logFile.write(
-                str(nError) + ". There was a problem to read the file " + listingFile + " in the Listing section.\n")
+                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
         return None
 
@@ -145,8 +146,6 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
 
     try:
 
-        rmm = []
-
         if forum == "BestCardingWorld":
             rmm = bestcardingworld_description_parser(soup)
         elif forum == "Cardingleaks":
@@ -161,6 +160,9 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
             rmm = procrax_description_parser(soup)
         elif forum == "Libre":
             rmm = libre_description_parser(soup)
+        else:
+            print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
+            raise Exception
 
         return rmm
 
     except:
@@ -239,7 +241,9 @@ def new_parse(forum, url, createLog):
             logFile = open(mainDir + f"/{CURRENT_DATE}/" + forum + "_" + CURRENT_DATE + ".log", "w")
         except:
             print("Could not open log file!")
-            raise SystemExit
+            createLog = False
+            logFile = None
+            # raise SystemExit
     else:
         logFile = None
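[review note] Dropping the rw = [] / rmm = [] fallbacks above changes the failure mode: an
unrecognized forum name now raises instead of silently returning an empty result, which is what
the new else branches enforce. A minimal alternative sketch with the same intent; the registry
dict is hypothetical (not part of this patch), and the parser names mirror ones this module
already imports:

    # hypothetical refactor sketch, not part of the patch
    LISTING_PARSERS = {
        "BestCardingWorld": bestcardingworld_listing_parser,
        "Procrax": procrax_listing_parser,
        "Libre": libre_listing_parser,
        # ... one entry per supported forum
    }

    def parse_listing(forum, listingFile, soup, createLog, logFile):
        try:
            parser = LISTING_PARSERS[forum]
        except KeyError:
            # fails as loudly as the patch's else branch, but names the culprit
            raise Exception("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY: " + forum)
        return parser(soup)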
diff --git a/MarketPlaces/AnonymousMarketplace/crawler_selenium.py b/MarketPlaces/AnonymousMarketplace/crawler_selenium.py
index ef3b475..99b4431 100644
--- a/MarketPlaces/AnonymousMarketplace/crawler_selenium.py
+++ b/MarketPlaces/AnonymousMarketplace/crawler_selenium.py
@@ -32,19 +32,19 @@ baseURL = 'http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
+    # opentor()
     mktName = getMKTName()
     driver = getAccess()
 
     if driver != 'down':
         try:
-            login(driver)
+            # login(driver)
             crawlForum(driver)
         except Exception as e:
             print(driver.current_url, e)
         closetor(driver)
 
-    # new_parse(mktName, baseURL, False)
+    new_parse(mktName, baseURL, True)
 
 
 # Opens Tor Browser
@@ -104,7 +104,7 @@ def createFFDriver():
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
     ff_prof.set_preference("network.dns.disablePrefetch", True)
     ff_prof.set_preference("network.http.sendRefererHeader", 0)
-    ff_prof.set_preference("permissions.default.image", 2)
+    ff_prof.set_preference("permissions.default.image", 1)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
     ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@@ -120,6 +120,8 @@ def createFFDriver():
 
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
 
@@ -185,11 +187,11 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # carding
-    links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/carding/')
+    # # carding
+    # links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/carding/')
     # # hacked paypal
-    links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacked-paypal-accounts/')
-    # # hacking services
+    # links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacked-paypal-accounts/')
+    # hacking services
     links.append('http://3fqr7fgjaslhgmeiin5e2ky6ra5xkiafyzg7i36sfcehv3jvpgydteqd.onion/product-category/hacking-services/')
 
     return links
@@ -250,7 +252,7 @@ def crawlForum(driver):
                 print(link, e)
             i += 1
 
-    input("Crawling AnonymousMarketplace forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the AnonymousMarketplace market done.")
 
 
 # Returns 'True' if the link is a description link
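[review note] The parser change below fixes a real bug along with the selector: the old code
appended the literal string 'item_name' rather than the variable item_name, so every product was
recorded under the same name. A self-contained sketch of the new extraction logic; the class names
come from the patch, while the sample HTML and URL are made up for illustration:

    from bs4 import BeautifulSoup

    sample = """
    <ul class="product_list_widget">
      <li><a href="http://example.onion/product/1">
        <span class="product-title">Demo product</span></a></li>
    </ul>
    """

    soup = BeautifulSoup(sample, "html.parser")
    for item in soup.find("ul", {"class": "product_list_widget"}).find_all("li"):
        href = item.find("a").get("href")
        name = item.find("span", {"class": "product-title"}).text
        print(href, name.strip())   # the variable, not the literal 'item_name'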
diff --git a/MarketPlaces/AnonymousMarketplace/parser.py b/MarketPlaces/AnonymousMarketplace/parser.py
index da11774..08dbaa3 100644
--- a/MarketPlaces/AnonymousMarketplace/parser.py
+++ b/MarketPlaces/AnonymousMarketplace/parser.py
@@ -88,14 +88,14 @@ def anonymousMarketplace_listing_parser(soup: Tag):
     href = []  # 20 Product_Links
 
-    product_list: ResultSet[Tag] = soup.find("ul", {"class": "products columns-4"}).find_all("li")
+    product_list: ResultSet[Tag] = soup.find("ul", {"class": "product_list_widget"}).find_all("li")
 
     for item in product_list:
 
         item_href = item.find("a").get("href")
         href.append(item_href)
 
-        item_name = item.find("h2", {"class": "woocommerce-loop-product__title"}).text
-        name.append(cleanString('item_name'.strip()))
+        item_name = item.find("span", {"class": "product-title"}).text
+        name.append(cleanString(item_name.strip()))
 
         item_rating = item.find("div", {"class": "star-rating"}).find("strong", {"class": "rating"}).text
         rating_item.append(cleanString(item_rating.strip()))
@@ -167,10 +167,10 @@ def anonymous_links_parser(soup):
     # Returning all links that should be visited by the Crawler
     href = []
 
-    listing = soup.find('ul', {"class": "products columns-4"}).findAll('li')
+    listing = soup.find('ul', {"class": "product_list_widget"}).findAll('li')
 
     for a in listing:
 
-        bae = a.find('a', {"class": "woocommerce-LoopProduct-link woocommerce-loop-product__link"}, href=True)
+        bae = a.find('a', href=True)
         link = bae['href']
         href.append(link)
diff --git a/MarketPlaces/Apocalypse/crawler_selenium.py b/MarketPlaces/Apocalypse/crawler_selenium.py
index 7b67cc4..c1f579d 100644
--- a/MarketPlaces/Apocalypse/crawler_selenium.py
+++ b/MarketPlaces/Apocalypse/crawler_selenium.py
@@ -34,17 +34,17 @@ baseURL = 'http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion
 def startCrawling():
     # opentor()
     mktName = getMKTName()
-    # driver = getAccess()
+    driver = getAccess()
 
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)
 
-    new_parse(mktName, baseURL, False)
+    new_parse(mktName, baseURL, True)
 
 
 # Opens Tor Browser
@@ -120,6 +120,8 @@ def createFFDriver():
 
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
 
+    driver.maximize_window()
+
     return driver
 
@@ -201,8 +203,8 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []
 
-    # Hacking Services
-    links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/19')
+    # # Hacking Services
+    # links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/19')
     # software and malware
     links.append('http://apocam5hnoqskkmhr325nivjuh5phbmmggadxgcjabzzirap5iklkxad.onion/subcategory/30')
@@ -244,7 +246,7 @@ def crawlForum(driver):
                     driver.back()
 
                 # comment out
-                break
+                # break
 
                 # comment out
                 if count == 1:
@@ -264,7 +266,7 @@ def crawlForum(driver):
                 print(link, e)
             i += 1
 
-    input("Crawling Apocalypse forum done sucessfully. 
Press ENTER to continue\n") + print("Crawling the Apocalypse market done.") # Returns 'True' if the link is a description link diff --git a/MarketPlaces/BlackPyramid/crawler_selenium.py b/MarketPlaces/BlackPyramid/crawler_selenium.py index dd3e251..ef7d7e0 100644 --- a/MarketPlaces/BlackPyramid/crawler_selenium.py +++ b/MarketPlaces/BlackPyramid/crawler_selenium.py @@ -26,8 +26,6 @@ from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.BlackPyramid.parser import blackpyramid_links_parser from MarketPlaces.Utilities.utilities import cleanHTML -config = configparser.ConfigParser() -config.read('../../setup.ini') counter = 1 baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/login/' @@ -35,8 +33,8 @@ baseURL = 'http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - opentor() - # mktName = getMKTName() + # opentor() + mktName = getMKTName() driver = getAccess() if driver != 'down': @@ -47,12 +45,14 @@ def startCrawling(): print(driver.current_url, e) closetor(driver) - # new_parse(forumName, baseURL, False) + new_parse(mktName, baseURL, True) # Opens Tor Browser #prompts for ENTER input to continue def opentor(): + from MarketPlaces.Initialization.markets_mining import config + global pid print("Connecting Tor...") pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) @@ -91,6 +91,8 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): + from MarketPlaces.Initialization.markets_mining import config + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -119,6 +121,8 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + driver.maximize_window() + return driver @@ -171,12 +175,14 @@ def savePage(driver, page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): - from MarketPlaces.Initialization.markets_mining import CURRENT_DATE + from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') else: - fullPath = r'..\BlackPyramid\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') return fullPath @@ -210,6 +216,7 @@ def getInterestedLinks(): # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') # # Services # links.append('http://blackpyoc3gbnrlvxqvvytd3kxqj7pd226i2gvfyhysj24ne2snkmnyd.onion/search/results/') + return links @@ -270,7 +277,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling BlackPyramid forum done sucessfully. 
Press ENTER to continue\n") + print("Crawling the BlackPyramid market done.") # Returns 'True' if the link is a description link diff --git a/MarketPlaces/CityMarket/crawler_selenium.py b/MarketPlaces/CityMarket/crawler_selenium.py index 6fbc683..4819f2a 100644 --- a/MarketPlaces/CityMarket/crawler_selenium.py +++ b/MarketPlaces/CityMarket/crawler_selenium.py @@ -33,8 +33,8 @@ baseURL = 'http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - opentor() - # mktName = getMKTName() + # opentor() + mktName = getMKTName() driver = getAccess() if driver != 'down': @@ -45,7 +45,7 @@ def startCrawling(): print(driver.current_url, e) closetor(driver) - # new_parse(forumName, baseURL, False) + new_parse(mktName, baseURL, True) # Opens Tor Browser @@ -121,6 +121,8 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + driver.maximize_window() + return driver @@ -198,18 +200,18 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # Hiring hacker - links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=3') + # # Hiring hacker + # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=3') # virus and malware links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=15') - # ddos - links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=16') - # software - links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=17') - # botnets - links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=18') - # hacking service - links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=31') + # # ddos + # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=16') + # # software + # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=17') + # # botnets + # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=18') + # # hacking service + # links.append('http://wsptlnuoo3johqzcdlwuj5zcwfh2dwmswz6hahqctuxttvxpanypmwad.onion/?sub_id=31') return links @@ -269,7 +271,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling CityMarket forum done sucessfully. 
Press ENTER to continue\n") + print("Crawling the CityMarket market done.") # Returns 'True' if the link is a description link diff --git a/MarketPlaces/CypherMarketplace/crawler_selenium.py b/MarketPlaces/CypherMarketplace/crawler_selenium.py index b39a74a..2219af5 100644 --- a/MarketPlaces/CypherMarketplace/crawler_selenium.py +++ b/MarketPlaces/CypherMarketplace/crawler_selenium.py @@ -32,8 +32,8 @@ baseURL = 'http://6c5qa2ke2esh6ake6u6yoxjungz2czbbl7hqxl75v5k37frtzhxuk7ad.onion # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - opentor() - # mktName = getMKTName() + # opentor() + mktName = getMKTName() driver = getAccess() if driver != 'down': @@ -44,7 +44,7 @@ def startCrawling(): print(driver.current_url, e) closetor(driver) - # new_parse(forumName, baseURL, False) + new_parse(mktName, baseURL, True) # Opens Tor Browser @@ -120,6 +120,8 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + driver.maximize_window() + return driver @@ -263,7 +265,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling CypherMarketplace forum done sucessfully. Press ENTER to continue\n") + print("Crawling the CypherMarketplace market done.") # Returns 'True' if the link is a description link diff --git a/MarketPlaces/DarkFox/captcha.png b/MarketPlaces/DarkFox/captcha.png deleted file mode 100644 index d1223da..0000000 Binary files a/MarketPlaces/DarkFox/captcha.png and /dev/null differ diff --git a/MarketPlaces/DarkFox/crawler_selenium.py b/MarketPlaces/DarkFox/crawler_selenium.py index 0f7ee1d..8e1ca7b 100644 --- a/MarketPlaces/DarkFox/crawler_selenium.py +++ b/MarketPlaces/DarkFox/crawler_selenium.py @@ -30,7 +30,7 @@ baseURL = 'http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - opentor() + # opentor() mktName = getMKTName() driver = getAccess() @@ -42,7 +42,7 @@ def startCrawling(): print(driver.current_url, e) closetor(driver) - new_parse(mktName, baseURL, False) + new_parse(mktName, baseURL, True) # Opens Tor Browser @@ -124,6 +124,7 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + driver.maximize_window() return driver @@ -145,6 +146,7 @@ def getAccess(): # then allows for manual solving of captcha in the terminal #@param: current selenium web driver def captcha(driver): + ''' # wait for captcha page show up WebDriverWait(driver, 100).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div/div/form/button[1]"))) @@ -168,6 +170,9 @@ def captcha(driver): # click the verify(submit) button driver.find_element(by=By.XPATH, value="/html/body/div/div/form/button[1]").click() + ''' + + input("Press ENTER when CAPTCHA is completed\n") # wait for listing page show up (This Xpath may need to change based on different seed url) WebDriverWait(driver, 100).until(EC.visibility_of_element_located( @@ -220,8 +225,7 @@ def getInterestedLinks(): # # Digital Products # links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/0e384d5f-26ef-4561-b5a3-ff76a88ab781') # Software and Malware - # 
links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc') - links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc?page=15') + links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/6b71210f-f1f9-4aa3-8f89-bd9ee28f7afc') # # Services # links.append('http://57d5j6bbwlpxbxe5tsjjy3vziktv3fo2o5j3nheo4gpg6lzpsimzqzid.onion/category/b9dc5846-5024-421e-92e6-09ba96a03280') # # Miscellaneous @@ -270,7 +274,7 @@ def crawlForum(driver): break # comment out - if count == 0: + if count == 1: break try: @@ -287,7 +291,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling BestCardingWorld forum done sucessfully. Press ENTER to continue\n") + print("Crawling the DarkFox market done.") # Returns 'True' if the link is a description link diff --git a/MarketPlaces/DarkMatter/crawler_selenium.py b/MarketPlaces/DarkMatter/crawler_selenium.py index c1eb457..e0babcb 100644 --- a/MarketPlaces/DarkMatter/crawler_selenium.py +++ b/MarketPlaces/DarkMatter/crawler_selenium.py @@ -32,7 +32,7 @@ baseURL = 'http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - opentor() + # opentor() mktName = getMKTName() driver = getAccess() @@ -44,7 +44,7 @@ def startCrawling(): print(driver.current_url, e) closetor(driver) - new_parse(mktName, baseURL, False) + new_parse(mktName, baseURL, True) # Opens Tor Browser @@ -121,6 +121,8 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + driver.maximize_window() + return driver @@ -185,15 +187,15 @@ def getNameFromURL(url): def getInterestedLinks(): links = [] - # digital fraud software - links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=76') - # legit - links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=78') + # # digital fraud software + # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=76') + # # legit + # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=78') # # hack guides - links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=94') + # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=94') # # services - links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=117') - # # software/malware + # links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=117') + # software/malware links.append('http://darkmat3kdxestusl437urshpsravq7oqb7t3m36u2l62vnmmldzdmid.onion/market/products/?category=121') return links @@ -236,16 +238,14 @@ def crawlForum(driver): driver.back() # to keep from detecting click speed - # # comment out - # break - # - # # comment out - # if count == 1: - # break + # comment out + break + + # comment out + if count == 1: + break try: - # nav = driver.find_element(by=By.XPATH, 
value='/html/body/table[1]/tbody/tr/td/form/div/div[2]/table[2]') - # a = nav.find_element(by=By.LINK_TEXT, value=">") link = driver.find_element(by=By.LINK_TEXT, value=">").get_attribute('href') if link == "": raise NoSuchElementException @@ -258,7 +258,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling DarkMatter forum done sucessfully. Press ENTER to continue\n") + print("Crawling the DarkMatter market done.") # Returns 'True' if the link is a description link diff --git a/MarketPlaces/DarkTor/crawler_selenium.py b/MarketPlaces/DarkTor/crawler_selenium.py index 74e22be..24c2990 100644 --- a/MarketPlaces/DarkTor/crawler_selenium.py +++ b/MarketPlaces/DarkTor/crawler_selenium.py @@ -31,8 +31,8 @@ baseURL = 'http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - opentor() - # mktName = getMKTName() + # opentor() + mktName = getMKTName() driver = getAccess() if driver != 'down': @@ -43,7 +43,7 @@ def startCrawling(): print(driver.current_url, e) closetor(driver) - # new_parse(forumName, baseURL, False) + new_parse(mktName, baseURL, True) # Opens Tor Browser @@ -119,6 +119,8 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + driver.maximize_window() + return driver @@ -186,10 +188,10 @@ def getInterestedLinks(): # Hacking links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/hacking-services/') - # Carding - links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/carding/') - # hacked paypals - links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/hacked-paypal-accounts/') + # # Carding + # links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/carding/') + # # hacked paypals + # links.append('http://zuauw53dukqdmll5p3fld26ns2gepcyfmbofobjczdni6ecmkoitnfid.onion/product-category/hacked-paypal-accounts/') return links @@ -248,7 +250,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling DarkTor forum done sucessfully. 
Press ENTER to continue\n") + print("Crawling the DarkTor market done.") # Returns 'True' if the link is a description link diff --git a/MarketPlaces/DigitalThriftShop/crawler_selenium.py b/MarketPlaces/DigitalThriftShop/crawler_selenium.py index 28424a8..132d2af 100644 --- a/MarketPlaces/DigitalThriftShop/crawler_selenium.py +++ b/MarketPlaces/DigitalThriftShop/crawler_selenium.py @@ -34,17 +34,17 @@ baseURL = 'http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion def startCrawling(): # opentor() mktName = getMKTName() - # driver = getAccess() + driver = getAccess() - # if driver != 'down': - # try: - # login(driver) - # crawlForum(driver) - # except Exception as e: - # print(driver.current_url, e) - # closetor(driver) + if driver != 'down': + try: + login(driver) + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closetor(driver) - new_parse(mktName, baseURL, False) + new_parse(mktName, baseURL, True) # Opens Tor Browser @@ -91,7 +91,6 @@ def closetor(driver): def createFFDriver(): from MarketPlaces.Initialization.markets_mining import config - ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) @@ -120,6 +119,8 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + driver.maximize_window() + return driver @@ -189,8 +190,8 @@ def getInterestedLinks(): links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/botnets/') # # data leak # links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/dataleak/') - # databases - links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/databases/') + # # databases + # links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/databases/') # # ransomware # links.append('http://kw4zlnfhxje7top26u57iosg55i7dzuljjcyswo2clgc3mdliviswwyd.onion/product-category/ransomware/') # # rats @@ -234,10 +235,10 @@ def crawlForum(driver): driver.back() # comment out - # break + break # comment out - if count == 10: + if count == 1: break try: @@ -254,7 +255,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling DigitalThriftShop forum done sucessfully. 
Press ENTER to continue\n") + print("Crawling the DigitalThriftShop market done.") # Returns 'True' if the link is a description link diff --git a/MarketPlaces/HiddenMarket/crawler_selenium.py b/MarketPlaces/HiddenMarket/crawler_selenium.py index 1b3e1b5..3813c76 100644 --- a/MarketPlaces/HiddenMarket/crawler_selenium.py +++ b/MarketPlaces/HiddenMarket/crawler_selenium.py @@ -29,7 +29,7 @@ baseURL = 'http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion # Opens Tor Browser, crawls the website def startCrawling(): - opentor() + # opentor() marketName = getMKTName() driver = getAccess() @@ -41,7 +41,7 @@ def startCrawling(): print(driver.current_url, e) closetor(driver) - new_parse(marketName, baseURL, False) + new_parse(marketName, baseURL, True) # Opens Tor Browser @@ -161,6 +161,8 @@ def createFFDriver(): driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + driver.maximize_window() + return driver @@ -211,27 +213,27 @@ def getInterestedLinks(): links = [] # # Civil Software - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/civil_softwares') + # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/civil_softwares') # # Tutorials - Carding - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/carding') - # # Digital - Hacks + # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/carding') + # Digital - Hacks links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/hacks') - # Digital - Exploit Kit - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/exploit_kit') + # # Digital - Exploit Kit + # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/exploit_kit') # # 0Day - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/0day') - # Digital Forensics - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/digital_forensics') - # Tutorials - Mining - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/mining') - # Tutorials - Worms - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/worms') - # Tutorials - Viruses - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/viruses') - # Tutorials - Trojans - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/trojans') - # Tutorials - Botnets - links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/botnets') + # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/0day') + # # Digital Forensics + # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/digital_forensics') + # # Tutorials - Mining + # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/mining') + # # Tutorials - Worms + # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/worms') + # # Tutorials - Viruses + # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/viruses') + # # Tutorials - Trojans + # 
links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/trojans') + # # Tutorials - Botnets + # links.append('http://mipx6eedtsvfgfcmbm3utjedgjez2w4dzjdrbhtd2mt3cicwhhzspxqd.onion/category/botnets') return links @@ -275,11 +277,11 @@ def crawlForum(driver): driver.back() # comment out - # break + break # comment out - # if count == 2: - # break + if count == 1: + break try: pageCount += 1 @@ -296,7 +298,7 @@ def crawlForum(driver): print(link, e) i += 1 - input("Crawling HiddenMarket market done sucessfully. Press ENTER to continue\n") + print("Crawling the HiddenMarket market done.") # Returns 'True' if the link is Topic link diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt index 2e6ee5b..fe4ac4a 100644 --- a/MarketPlaces/Initialization/marketsList.txt +++ b/MarketPlaces/Initialization/marketsList.txt @@ -1 +1,8 @@ -HiddenMarket \ No newline at end of file +Apocalypse +DarkMatter +DigitalThriftShop +HiddenMarket +Nexus +Robinhood +TorBay +ViceCity \ No newline at end of file diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py index ea8269a..4b9c02e 100644 --- a/MarketPlaces/Initialization/markets_mining.py +++ b/MarketPlaces/Initialization/markets_mining.py @@ -4,7 +4,6 @@ __author__ = 'DarkWeb' Starting point of the Darkweb Markets Mining ''' -import os from datetime import * from MarketPlaces.DarkFox.crawler_selenium import crawler as crawlerDarkFox from MarketPlaces.Tor2door.crawler_selenium import crawler as crawlerTor2door @@ -24,9 +23,11 @@ from MarketPlaces.ViceCity.crawler_selenium import crawler as crawlerViceCity from MarketPlaces.HiddenMarket.crawler_selenium import crawler as crawlerHiddenMarket from MarketPlaces.RobinhoodMarket.crawler_selenium import crawler as crawlerRobinhoodMarket from MarketPlaces.Nexus.crawler_selenium import crawler as crawlerNexus +from MarketPlaces.CypherMarketplace.crawler_selenium import crawler as crawlerCypher import configparser -import time +import os +import subprocess config = configparser.ConfigParser() config.read('../../setup.ini') @@ -71,18 +72,34 @@ def createDirectory(mkt): os.mkdir(descReadDir) +# Opens Tor Browser +def opentor(): + global pid + print("Connecting Tor...") + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) + pid = pro.pid + # time.sleep(7.5) + input('Press ENTER when Tor is connected to continue') + return + + if __name__ == '__main__': + + # opentor() + mktsList = getMarkets() for mkt in mktsList: mkt = mkt.replace('\n','') - print(f"Creating listing and description directories of {mkt} ...") + print("\nCreating listing and description directories ... for " + mkt) createDirectory(mkt) - time.sleep(5) - input("Directories created successfully. 
Press ENTER to continue\n")
+        print("Directories created.")
 
         if mkt == "DarkFox":
+            # for base in json["DarkFox"]["base"]:
+            #     if crawlerDarkFox(base["url"], base["categories"]):
+            #         break
             crawlerDarkFox()
         elif mkt == 'Tor2door':
             crawlerTor2door()
@@ -118,7 +135,7 @@ if __name__ == '__main__':
             crawlerRobinhoodMarket()
         elif mkt == "Nexus":
             crawlerNexus()
+        elif mkt == "CypherMarketplace":
+            crawlerCypher()
 
-
-
-    print("Scraping process completed successfully!")
+    print("\nScraping process completed!")
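[review note] The prepare_parser.py rewrite below breaks the old monolithic new_parse() into small
helpers (read_file, parse_listing, parse_description, persist_record, move_file) plus a global
error counter. A condensed, self-contained sketch of the read-then-fallback helper, assuming only
bs4 and the stdlib; the file path is illustrative and the error bookkeeping is omitted:

    import codecs
    from bs4 import BeautifulSoup

    def read_file(filePath):
        # try strict UTF-8 first, then the platform default encoding,
        # mirroring the patch's two nested try blocks
        for opener in (lambda p: codecs.open(p, encoding='utf8'), open):
            try:
                with opener(filePath.strip('\n')) as html:
                    return BeautifulSoup(html, "html.parser")
            except Exception:
                continue
        return None   # callers treat None as "skip this page"

    # usage: soup = read_file("2023-08-01/Listing/example.html")  # hypothetical path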
diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py
index afb85b2..c626b6a 100644
--- a/MarketPlaces/Initialization/prepare_parser.py
+++ b/MarketPlaces/Initialization/prepare_parser.py
@@ -20,9 +20,12 @@ from MarketPlaces.TorMarket.parser import *
 from MarketPlaces.HiddenMarket.parser import *
 from MarketPlaces.RobinhoodMarket.parser import *
 from MarketPlaces.Nexus.parser import *
+from MarketPlaces.MikesGrandStore.parser import *
 
 from MarketPlaces.Classifier.classify_product import predict
 
+nError = 0
+
 
 def mergePages(rmm, rec):
 
@@ -82,13 +85,182 @@ def persist_data(url, row, cur):
 
         create_items(cur, row, marketPlace, vendor)
 
 
+def incrementError():
+    global nError
+    nError += 1
+
+
+def read_file(filePath, createLog, logFile):
+    try:
+        html = codecs.open(filePath.strip('\n'), encoding='utf8')
+        soup = BeautifulSoup(html, "html.parser")
+        html.close()
+        return soup
+    except:
+
+        try:
+            html = open(filePath.strip('\n'))
+            soup = BeautifulSoup(html, "html.parser")
+            html.close()
+            return soup
+        except:
+
+            incrementError()
+            print("There was a problem to read the file " + filePath)
+            if createLog:
+                logFile.write(
+                    str(nError) + ". There was a problem to read the file " + filePath + "\n")
+            return None
+
+
+def parse_listing(marketPlace, listingFile, soup, createLog, logFile):
+    try:
+
+        if marketPlace == "DarkFox":
+            rw = darkfox_listing_parser(soup)
+        elif marketPlace == "Tor2door":
+            rw = tor2door_listing_parser(soup)
+        elif marketPlace == "Apocalypse":
+            rw = apocalypse_listing_parser(soup)
+        elif marketPlace == "ThiefWorld":
+            rw = thiefWorld_listing_parser(soup)
+        elif marketPlace == "AnonymousMarketplace":
+            rw = anonymousMarketplace_listing_parser(soup)
+        elif marketPlace == "ViceCity":
+            rw = vicecity_listing_parser(soup)
+        elif marketPlace == "TorBay":
+            rw = torbay_listing_parser(soup)
+        elif marketPlace == "M00nkeyMarket":
+            rw = m00nkey_listing_parser(soup)
+        elif marketPlace == "HiddenMarket":
+            rw = hiddenmarket_listing_parser(soup)
+        elif marketPlace == "DarkMatter":
+            rw = darkmatter_listing_parser(soup)
+        elif marketPlace == "DigitalThriftShop":
+            rw = digitalThriftShop_listing_parser(soup)
+        elif marketPlace == "LionMarketplace":
+            rw = lionmarketplace_listing_parser(soup)
+        elif marketPlace == "TorMarket":
+            rw = tormarket_listing_parser(soup)
+        elif marketPlace == "RobinhoodMarket":
+            rw = Robinhood_listing_parser(soup)
+        elif marketPlace == "Nexus":
+            rw = nexus_listing_parser(soup)
+        elif marketPlace == "MikesGrandStore":
+            rw = mikesGrandStore_listing_parser(soup)
+        else:
+            print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
+            raise Exception
+        return rw
+
+    except:
+
+        incrementError()
+        print("There was a problem to parse the file " + listingFile + " in the listing section!")
+        traceback.print_exc()
+        if createLog:
+            logFile.write(
+                str(nError) + ". There was a problem to parse the file " + listingFile + " in the Listing section.\n")
+        return None
+
+
+def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
+    try:
+
+        if marketPlace == "DarkFox":
+            rmm = darkfox_description_parser(soup)
+        elif marketPlace == "Tor2door":
+            rmm = tor2door_description_parser(soup)
+        elif marketPlace == "Apocalypse":
+            rmm = apocalypse_description_parser(soup)
+        elif marketPlace == "ThiefWorld":
+            rmm = thiefWorld_description_parser(soup)
+        elif marketPlace == "AnonymousMarketplace":
+            rmm = anonymousMarketplace_description_parser(soup)
+        elif marketPlace == "ViceCity":
+            rmm = vicecity_description_parser(soup)
+        elif marketPlace == "TorBay":
+            rmm = torbay_description_parser(soup)
+        elif marketPlace == "M00nkeyMarket":
+            rmm = m00nkey_description_parser(soup)
+        elif marketPlace == "HiddenMarket":
+            rmm = hiddenmarket_description_parser(soup)
+        elif marketPlace == "DarkMatter":
+            rmm = darkmatter_description_parser(soup)
+        elif marketPlace == "DigitalThriftShop":
+            rmm = digitalThriftShop_description_parser(soup)
+        elif marketPlace == "LionMarketplace":
+            rmm = lionmarketplace_description_parser(soup)
+        elif marketPlace == "TorMarket":
+            rmm = tormarket_description_parser(soup)
+        elif marketPlace == "RobinhoodMarket":
+            rmm = Robinhood_description_parser(soup)
+        elif marketPlace == "Nexus":
+            rmm = nexus_description_parser(soup)
+        elif marketPlace == "MikesGrandStore":
+            rmm = mikesGrandStore_description_parser(soup)
+        else:
+            print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
+            raise Exception
+        return rmm
+
+    except:
+
+        incrementError()
+        print("There was a problem to parse the file " + descriptionFile + " in the Description section!")
+        traceback.print_exc()
+        if createLog:
+            logFile.write(
+                str(nError) + ". There was a problem to parse the file " + descriptionFile + " in the Description section.\n")
+        return None
+
+
+def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):
+    try:
+        persist_data(url, tuple(rec), cur)
+        con.commit()
+        return True
+    except:
+
+        con.rollback()
+
+        trace = traceback.format_exc()
+
+        if trace.find("already exists") == -1:
+            incrementError()
+            print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
+            traceback.print_exc()
+            if createLog:
+                logFile.write(
+                    str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
+            return False
+        else:
+            return True
+
+
+def move_file(filePath, createLog, logFile):
+    # source = line2.replace(os.path.basename(line2), "") + filename
+    source = filePath
+    destination = filePath.replace(os.path.basename(filePath), "") + r'Read/'
+
+    try:
+        shutil.move(source, destination)
+        return True
+    except:
+
+        print("There was a problem to move the file " + filePath)
+        incrementError()
+        if createLog:
+            logFile.write(
+                str(nError) + ". 
There was a problem to move the file " + filePath + "\n") + return False + + def new_parse(marketPlace, url, createLog): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - print("Parsing the " + marketPlace + " marketplace and conduct data classification to store the information in the database.") - - # ini = time.time() + print("Parsing the " + marketPlace + " market and conduct data classification to store the information in the database.") # Connecting to the database con = connectDataBase() @@ -97,271 +269,131 @@ def new_parse(marketPlace, url, createLog): # Creating the tables (The database should be created manually) create_database(cur, con) - nError = 0 - - lines = [] # listing pages - lns = [] # description pages - detPage = {} + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + marketPlace + "/HTML_Pages") - #Creating the log file for each Market Place + # Creating the log file for each Forum if createLog: - if not os.path.exists("./" + marketPlace + "/Logs/" + marketPlace + "_" + CURRENT_DATE + ".log"): - logFile = open("./" + marketPlace + "/Logs/" + marketPlace + "_" + CURRENT_DATE + ".log", "w") - else: - print("Files of the date " + CURRENT_DATE + " from the Market Place " + marketPlace + - " were already read. Delete the referent information in the Data Base and also delete the log file" - " in the _Logs folder to read files from this Market Place of this date again.") - raise SystemExit - - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + marketPlace + "/HTML_Pages") + try: + logFile = open(mainDir + f"/{CURRENT_DATE}/" + marketPlace + "_" + CURRENT_DATE + ".log", "w") + except: + print("Could not open log file!") + createLog = False + logFile = None + # raise SystemExit + else: + logFile = None # Reading the Listing Html Pages - for fileListing in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')): - lines.append(fileListing) + listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')) + for listingIndex, listingFile in enumerate(listings): - # Reading the Description Html Pages - for fileDescription in glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", '*.html')): - lns.append(fileDescription) + print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(listingFile) + "', index= " + str( + listingIndex + 1) + " ... " + str(len(listings))) - # Parsing the Description Pages and put the tag's content into a dictionary (Hash table) - for index, line2 in enumerate(lns): + listingSoup = read_file(listingFile, createLog, logFile) - print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename(line2) + "', index= " + str(index + 1) + " ... " + str(len(lns))) + # listing flags + doParseListing = listingSoup is not None + doDescription = False - try: - html = codecs.open(line2.strip('\n'), encoding='utf8') - soup = BeautifulSoup(html, "html.parser") - html.close() - except: + readDescriptionError = False + parseDescriptionError = False + persistDescriptionError = False + moveDescriptionError = False + findDescriptionError = False - try: - html = open(line2.strip('\n')) - soup = BeautifulSoup(html, "html.parser") - html.close() - except: + rw = [] - nError += 1 - print("There was a problem to read the file " + line2 + " in the Description section!") - if createLog: - logFile.write(str(nError) + ". 
There was a problem to read the file " + line2 + " in the Description section.\n") - continue + if doParseListing: - try: + rw = parse_listing(marketPlace, listingFile, listingSoup, createLog, logFile) - if marketPlace == "DarkFox": - rmm = darkfox_description_parser(soup) - elif marketPlace == "Tor2door": - rmm = tor2door_description_parser(soup) - elif marketPlace == "Apocalypse": - rmm = apocalypse_description_parser(soup) - elif marketPlace == "ThiefWorld": - rmm = thiefWorld_description_parser(soup) - elif marketPlace =="AnonymousMarketplace": - rmm = anonymousMarketplace_description_parser(soup) - elif marketPlace == "ViceCity": - rmm = vicecity_description_parser(soup) - elif marketPlace == "TorBay": - rmm = torbay_description_parser(soup) - elif marketPlace == "M00nkeyMarket": - rmm = m00nkey_description_parser(soup) - elif marketPlace == "HiddenMarket": - rmm = hiddenmarket_description_parser(soup) - elif marketPlace == "DarkMatter": - rmm = darkmatter_description_parser(soup) - elif marketPlace == "DigitalThriftShop": - rmm = digitalThriftShop_description_parser(soup) - elif marketPlace == "LionMarketplace": - rmm = lionmarketplace_description_parser(soup) - elif marketPlace == "TorMarket": - rmm = tormarket_description_parser(soup) - elif marketPlace == "RobinhoodMarket": - rmm = Robinhood_description_parser(soup) - elif marketPlace == "Nexus": - rmm = nexus_description_parser(soup) - # key = u"Pr:" + rmm[0].upper()[:desc_lim1] + u" Vendor:" + rmm[13].upper()[:desc_lim2] - key = u"Url:" + os.path.basename(line2).replace(".html", "") - - # save file address with description record in memory - detPage[key] = {'rmm': rmm, 'filename': os.path.basename(line2)} - - except Exception as e: - raise e - - nError += 1 - print("There was a problem to parse the file " + line2 + " in the Description section!") - if createLog: - logFile.write(str(nError) + ". There was a problem to parse the file " + line2 + " in the Description section.\n") + doDescription = rw is not None - # Parsing the Listing Pages and put the tag's content into a list - for index, line1 in enumerate(lines): + if doDescription: - print("Reading listing folder of '" + marketPlace + "', file '" + os.path.basename(line1) + "', index= " + str(index + 1) + " ... " + str(len(lines))) + nFound = 0 - readError = False - try: - html = codecs.open(line1.strip('\n'), encoding='utf8') - soup = BeautifulSoup(html, "html.parser") - html.close() - except: + for rec in rw: - try: - html = open(line1.strip('\n')) - soup = BeautifulSoup(html, "html.parser") - html.close() - except Exception as e: - raise e - nError += 1 - print("There was a problem to read the file " + line1 + " in the Listing section!") - if createLog: - logFile.write(str(nError) + ". 
There was a problem to read the file " + line1 + " in the Listing section.\n") - readError = True - - if not readError: - - parseError = False - try: - - if marketPlace == "DarkFox": - rw = darkfox_listing_parser(soup) - elif marketPlace == "Tor2door": - rw = tor2door_listing_parser(soup) - elif marketPlace == "Apocalypse": - rw = apocalypse_listing_parser(soup) - elif marketPlace == "ThiefWorld": - rw = thiefWorld_listing_parser(soup) - elif marketPlace == "AnonymousMarketplace": - rw = anonymousMarketplace_listing_parser(soup) - elif marketPlace == "ViceCity": - rw = vicecity_listing_parser(soup) - elif marketPlace == "TorBay": - rw = torbay_listing_parser(soup) - elif marketPlace == "M00nkeyMarket": - rw = m00nkey_listing_parser(soup) - elif marketPlace == "HiddenMarket": - rw =hiddenmarket_listing_parser(soup) - elif marketPlace == "DarkMatter": - rw = darkmatter_listing_parser(soup) - elif marketPlace == "DigitalThriftShop": - rw = digitalThriftShop_listing_parser(soup) - elif marketPlace == "LionMarketplace": - rw = lionmarketplace_listing_parser(soup) - elif marketPlace == "TorMarket": - rw = tormarket_listing_parser(soup) - elif marketPlace == "RobinhoodMarket": - rw = Robinhood_listing_parser(soup) - elif marketPlace == "Nexus": - rw = nexus_listing_parser(soup) - else: - parseError = True - - except Exception as e: - - nError += 1 - print("There was a problem to parse the file " + line1 + " in the listing section!") - if createLog: - logFile.write( - str(nError) + ". There was a problem to parse the file " + line1 + " in the Listing section.\n") - parseError = True + rec = rec.split(',') - if not parseError: + descriptionPattern = cleanLink(rec[20]) + ".html" - persistError = False - moveError = False - num_in_db = 0 - num_persisted_moved = 0 + # Reading the associated description Html Pages + descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern)) - for rec in rw: + nFound += len(descriptions) - rec = rec.split(',') + for descriptionIndex, descriptionFile in enumerate(descriptions): - # if len(detPage) > 0: #It was created here just because Zeroday Market does not have Description Pages - # key = rec[23] + print("Reading description folder of '" + marketPlace + "', file '" + os.path.basename( + descriptionFile) + "', index= " + str(descriptionIndex + 1) + " ... 
" + str(len(descriptions))) - # key = u"Pr:" + rec[1].upper()[:list_lim1] + u" Vendor:" + rec[18].upper()[:list_lim2] - key = u"Url:" + cleanLink(rec[20]) + descriptionSoup = read_file(descriptionFile, createLog, logFile) - # if the associated description page is parsed - if key in detPage: + # description flags + doParseDescription = descriptionSoup is not None + doPersistRecord = False + doMoveDescription = False - # rec = mergePages(detPage, rec) + rmm = [] + + if doParseDescription: + + rmm = parse_description(marketPlace, descriptionFile, descriptionSoup, createLog, logFile) + + doPersistRecord = rmm is not None + + else: + readDescriptionError = True + parseDescriptionError = True + + if doPersistRecord: # Combining the information from Listing and Description Pages - rmm = detPage[key]['rmm'] rec = mergePages(rmm, rec) - # Append to the list the classification of the product - # rec.append(str(predict(rec[1], rec[5], language='markets'))) + # Append to the list the classification of the topic rec.append(str(predict(rec[4], rec[5], language='sup_english'))) # Persisting the information in the database - try: - persist_data(url, tuple(rec), cur) - con.commit() - except Exception as e: - - trace = traceback.format_exc() - - if trace.find("already exists") == -1: - nError += 1 - print("There was a problem to persist the file " + detPage[key]['filename'] + " in the database!") - if createLog: - logFile.write( - str(nError) + ". There was a problem to persist the file " + detPage[key]['filename'] + " in the database.\n") - persistError = True - - con.rollback() - - if not persistError: - - # move description files of completed folder - source = line2.replace(os.path.basename(line2), "") + detPage[key]['filename'] - destination = line2.replace(os.path.basename(line2), "") + r'Read/' - - try: - shutil.move(source, destination) - num_persisted_moved += 1 - except: - - print("There was a problem to move the file " + detPage[key]['filename'] + " in the Description section!") - nError += 1 - if createLog: - logFile.write( - str(nError) + ". There was a problem to move the file " + detPage[key]['filename'] + " in the Description section!.\n") - moveError = True - - # if the associated description page is not read or not parsed + persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, + descriptionFile) + + doMoveDescription = persistSuccess + else: - # query database - # if the product already exists: - # num_in_db += 1 - pass + parseDescriptionError = True - # if number of products on listing page is equal to - # the number of merged, persisted, and moved products plus - # the number of products already in the database - if not persistError and not moveError and len(rw) == (num_persisted_moved + num_in_db): + if doMoveDescription: - # move listing file to completed folder - source = line1 - destination = line1.replace(os.path.basename(line1), "") + r'Read/' + # move description files of completed folder + moveSuccess = move_file(descriptionFile, createLog, logFile) - try: - shutil.move(source, destination) - except: + if not moveSuccess: + moveDescriptionError = True - nError += 1 - print("There was a problem to move the file " + line1 + " in the Listing section!") - if createLog: - logFile.write(str(nError) + ". 
There was a problem to move the file " + line1 + " in the Listing section!\n")
+                        else:
+                            moveDescriptionError = True

-        # g.close ()
+        if not (nFound > 0):

-        if createLog:
-            logFile.close()
+            findDescriptionError = True
+
+            incrementError()
+            print(f"There was a problem to locate the file(s) for {listingFile} in the Description section!")
+            if createLog:
+                logFile.write(
+                    str(nError) + f". There was a problem to locate the file(s) for {listingFile}"
+                                  f" in the Description section!\n")

-    # end = time.time()
+        if not (readDescriptionError or parseDescriptionError or persistDescriptionError
+                or moveDescriptionError or findDescriptionError):
+            # move listing files of completed folder
+            move_file(listingFile, createLog, logFile)

-    # finalTime = float(end-ini)
+    if createLog:
+        logFile.close()

-    # print (marketPlace + " Parsing Perfomed Succesfully in %.2f" %finalTime + "!")
-    input("Parsing the " + marketPlace + " marketplace and data classification done successfully. Press ENTER to continue\n")
+    print("Parsing the " + marketPlace + " market and data classification done.")
diff --git a/MarketPlaces/LionMarketplace/crawler_selenium.py b/MarketPlaces/LionMarketplace/crawler_selenium.py
index 7558452..237838f 100644
--- a/MarketPlaces/LionMarketplace/crawler_selenium.py
+++ b/MarketPlaces/LionMarketplace/crawler_selenium.py
@@ -31,19 +31,19 @@ baseURL = 'http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
+    # opentor()
     mktName = getMKTName()
     driver = getAccess()

     if driver != 'down':
         try:
-            login(driver)
+            # login(driver)
             crawlForum(driver)
         except Exception as e:
             print(driver.current_url, e)
         closetor(driver)

-    new_parse(mktName, baseURL, False)
+    new_parse(mktName, baseURL, True)


 # Opens Tor Browser
@@ -103,7 +103,7 @@ def createFFDriver():
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
     ff_prof.set_preference("network.dns.disablePrefetch", True)
     ff_prof.set_preference("network.http.sendRefererHeader", 0)
-    ff_prof.set_preference("permissions.default.image", 2)
+    ff_prof.set_preference("permissions.default.image", 1)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
     ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@@ -119,6 +119,8 @@ def createFFDriver():

     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver
@@ -187,12 +189,12 @@ def getInterestedLinks():
     # Software/Malware
     links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/16')
-    # Carding
-    links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/20')
-    # Hacking
-    links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/ba142ac0-c7e7-11ec-9bd1-fdd89c3d3f91')
-    # tutorial
-    links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/19')
+    # # Carding
+    # links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/20')
+    # # Hacking
+    # links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/ba142ac0-c7e7-11ec-9bd1-fdd89c3d3f91')
+    # # tutorial
+    # links.append('http://lionznqc2hg2wsp5vgruqait4cpknihwlje6hkjyi52lcl5ivyf7bcad.onion/category/19')

     return links
@@ -231,12 +233,12 @@ def crawlForum(driver):
                     savePage(driver, driver.page_source, item)
                     driver.back()

-                # # comment out
-                # break
-                #
-                # # comment out
-                # if count == 1:
-                #     break
+                # comment out
+                break
+
+                # comment out
+                if count == 1:
+                    break

                 try:
                     link = driver.find_element(by=By.XPATH, value=
@@ -252,7 +254,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling LionMarketplace forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the LionMarketplace market done.")


 # Returns 'True' if the link is a description link
diff --git a/MarketPlaces/M00nkeyMarket/crawler_selenium.py b/MarketPlaces/M00nkeyMarket/crawler_selenium.py
index 83413fc..ccd8f11 100644
--- a/MarketPlaces/M00nkeyMarket/crawler_selenium.py
+++ b/MarketPlaces/M00nkeyMarket/crawler_selenium.py
@@ -34,17 +34,17 @@ MARKET_NAME = 'M00nkeyMarket'
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
     # opentor()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()

-    new_parse(MARKET_NAME, BASE_URL, False)
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)
+
+    new_parse(MARKET_NAME, BASE_URL, True)


 # Opens Tor Browser
@@ -120,6 +120,8 @@ def createFFDriver():

     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver
@@ -203,7 +205,7 @@ def getInterestedLinks():
     # software
     links.append('http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/search/subcategories?subcategory=30')
     # # guides
-    links.append('http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/search/subcategories?subcategory=17')
+    # links.append('http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/search/subcategories?subcategory=17')

     return links
@@ -243,11 +245,11 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                # break
+                break

                 # comment out
-                # if count == 1:
-                #     break
+                if count == 1:
+                    break

                 try:
                     link = driver.find_element(by=By.LINK_TEXT, value='Next ›').get_attribute('href')
@@ -262,7 +264,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling M00nkeyMarket done sucessfully. Press ENTER to continue\n")
+    print("Crawling the M00nkeyMarket done.")


 # Returns 'True' if the link is a description link
diff --git a/MarketPlaces/MikesGrandStore/crawler_selenium.py b/MarketPlaces/MikesGrandStore/crawler_selenium.py
index b6b67ac..cd45464 100644
--- a/MarketPlaces/MikesGrandStore/crawler_selenium.py
+++ b/MarketPlaces/MikesGrandStore/crawler_selenium.py
@@ -31,47 +31,19 @@ baseURL = 'http://4yx2akutmkhwfgzlpdxiah7cknurw6vlddlq24fxa3r3ebophwgpvhyd.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    print("Welcome to the darkweb pipeline. Do you want to run:")
-    print("[A] Entire pipeline\t[B] Crawler only\t[C] Parser only")
-    choice = input()
-
-    while choice not in {'A', 'B', 'C'}:
-        print("Choose the options below only!")
-        print("[A] Entire pipeline\t[B] Crawler only\t[C] Parser only")
-        choice = input()
-
-    if choice == 'A':
-        opentor()
-        mktName = getMKTName()
-        driver = getAccess()
-
-        if driver != 'down':
-            try:
-                login(driver)
-                crawlForum(driver)
-            except Exception as e:
-                print(driver.current_url, e)
-            closetor(driver)
-
-        new_parse(mktName, baseURL, False)
-
-
-    if choice == 'B':
-        opentor()
-        driver = getAccess()
-
-        if driver != 'down':
-            try:
-                login(driver)
-                crawlForum(driver)
-            except Exception as e:
-                print(driver.current_url, e)
-            closetor(driver)
-
-
-    if choice == 'C':
-        mktName = getMKTName()
-        new_parse(mktName, baseURL, False)
+    # opentor()
+    mktName = getMKTName()
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)
+
+    new_parse(mktName, baseURL, True)


 # Opens Tor Browser
@@ -131,7 +103,7 @@ def createFFDriver():
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
     ff_prof.set_preference("network.dns.disablePrefetch", True)
     ff_prof.set_preference("network.http.sendRefererHeader", 0)
-    ff_prof.set_preference("permissions.default.image", 2)
+    ff_prof.set_preference("permissions.default.image", 1)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
     ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@@ -147,6 +119,8 @@ def createFFDriver():

     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver
@@ -275,7 +249,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling MikesGrandStore forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the MikesGrandStore market done.")


 # Returns 'True' if the link is a description link
diff --git a/MarketPlaces/Nexus/crawler_selenium.py b/MarketPlaces/Nexus/crawler_selenium.py
index 4ae2a21..70e1480 100644
--- a/MarketPlaces/Nexus/crawler_selenium.py
+++ b/MarketPlaces/Nexus/crawler_selenium.py
@@ -31,7 +31,7 @@ baseURL = 'http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
+    # opentor()
     mktName = getMKTName()
     driver = getAccess()

@@ -40,9 +40,9 @@ def startCrawling():
             crawlForum(driver)
         except Exception as e:
             print(driver.current_url, e)
-            closetor(driver)
+        closetor(driver)

-    new_parse(mktName, baseURL, False)
+    new_parse(mktName, baseURL, True)


 # Opens Tor Browser
 #prompts for ENTER input to continue
@@ -116,6 +116,8 @@ def createFFDriver():

     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver
@@ -131,8 +133,8 @@ def getAccess():
         driver.close()
         return 'down'

-def savePage(page, url):
-    cleanPage = cleanHTML(page)
+def savePage(driver, page, url):
+    cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)
     open(filePath, 'wb').write(cleanPage.encode('utf-8'))
@@ -173,14 +175,14 @@ def getInterestedLinks():
     # Bot nets
     links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/botnets/')
-    # Rats
-    links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/rats/')
-    # Ransomware
-    links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/ransomware/')
-    # Other Malware
-    links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/outros-malware/')
-    # Hacking Tools & Scripting
-    links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/ferramentas-de-hacking-scripts/')
+    # # Rats
+    # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/rats/')
+    # # Ransomware
+    # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/ransomware/')
+    # # Other Malware
+    # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/malware/outros-malware/')
+    # # Hacking Tools & Scripting
+    # links.append('http://nexus2bmba34euohk3xo7og2zelkgbtc2p7rjsbxrjjknlecja2tdvyd.onion/categoria-produto/hacking-spam/ferramentas-de-hacking-scripts/')

     return links
@@ -207,7 +209,7 @@ def crawlForum(driver):
             except:
                 driver.refresh()
             html = driver.page_source
-            savePage(html, link)
+            savePage(driver, html, link)

             list = productPages(html)
             for item in list:
@@ -216,9 +218,16 @@ def crawlForum(driver):
                     driver.get(itemURL)
                 except:
                     driver.refresh()
-                savePage(driver.page_source, item)
+                savePage(driver, driver.page_source, item)
                 driver.back()

+                # comment out
+                break
+
+                # comment out
+                if count == 1:
+                    break
+
             try:
                 link = driver.find_element(by=By.XPATH, value=
                     '/html/body/div[1]/div[2]/div/div/main/nav/ul/li[3]/a').get_attribute('href')
@@ -233,7 +242,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling Nexus done sucessfully. Press ENTER to continue\n")
+    print("Crawling the Nexus market done.")


 # Returns 'True' if the link is a description link
@@ -263,5 +272,5 @@ def productPages(html):
 def crawler():
     startCrawling()
-    print("Crawling and Parsing Nexus .... DONE!")
+    # print("Crawling and Parsing Nexus .... DONE!")
diff --git a/MarketPlaces/Nexus/parser.py b/MarketPlaces/Nexus/parser.py
index 1b28984..3c0cfb6 100644
--- a/MarketPlaces/Nexus/parser.py
+++ b/MarketPlaces/Nexus/parser.py
@@ -107,7 +107,7 @@ def nexus_listing_parser(soup):
             # Finding the name of the product
             name_of_product = product.find("h2", {"class": "woocommerce-loop-product__title"}).find("a").text
             name_of_product_cleaned = cleanString(name_of_product.strip())
-            print(name_of_product_cleaned)
+            # print(name_of_product_cleaned)
             name.append(name_of_product_cleaned)
             #finding the URL
             try:
@@ -135,8 +135,8 @@ def nexus_listing_parser(soup):
             qLeft.append("-1")
             shipFrom.append("-1")
             shipTo.append("-1")
-            print("Done! moving onto the next product!")
-            print(len(shipTo))
+            # print("Done! moving onto the next product!")
+            # print(len(shipTo))
             nm += 1
         except AttributeError as e:
             print("I'm somewhere I don't belong. I'm going to leave")
diff --git a/MarketPlaces/RobinhoodMarket/crawler_selenium.py b/MarketPlaces/RobinhoodMarket/crawler_selenium.py
index ab22f78..9124a8f 100644
--- a/MarketPlaces/RobinhoodMarket/crawler_selenium.py
+++ b/MarketPlaces/RobinhoodMarket/crawler_selenium.py
@@ -1,7 +1,7 @@
 __author__ = 'chris'

 '''
-WeTheNorth Market Crawler (Selenium)
+RobinhoodMarket Crawler (Selenium)
 '''

 from selenium import webdriver
@@ -23,8 +23,6 @@
 from MarketPlaces.Initialization.prepare_parser import new_parse
 from MarketPlaces.RobinhoodMarket.parser import Robinhood_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML

-config = configparser.ConfigParser()
-config.read('../../setup.ini')

 counter = 1
 baseURL = 'http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/'
@@ -34,15 +32,14 @@ def startCrawling():
     # Opening tor beforehand gives "Tor exited during startup error"
     # opentor()

-    marketName = getMarketName()
+    marketName = getMKTName()
     driver = getAccess()

-    # Captcha
-    input("Press ENTER when website has loaded")
-
     if driver != 'down':
         try:
+            # Captcha
+            input("Press ENTER when website has loaded")
             # Robinhood doesn't need login
             # login(driver)
             crawlForum(driver)
@@ -50,11 +47,13 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)

-    new_parse(marketName, baseURL, False)
+    new_parse(marketName, baseURL, True)


 # Opens Tor Browser
 def opentor():
+    from MarketPlaces.Initialization.markets_mining import config
+
     global pid
     print("Connecting Tor...")
     pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@@ -70,7 +69,7 @@ def login(driver):


 # Returns the name of the website
-def getMarketName():
+def getMKTName():
     name = 'RobinhoodMarket'
     return name

@@ -96,6 +95,8 @@ def closetor(driver):
 # Creates FireFox 'driver' and configure its 'Profile'
 # to use Tor proxy and socket
 def createFFDriver():
+    from MarketPlaces.Initialization.markets_mining import config
+
     ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

     ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@@ -124,13 +125,14 @@ def createFFDriver():

     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver


 def getAccess():
     url = getFixedURL()
     driver = createFFDriver()
-    input('Tor Connected. Press ENTER to continue\n')
     try:
         driver.get(url)
         return driver
@@ -150,12 +152,14 @@ def savePage(driver, page, url):

 # Gets the full path of the page to be saved along with its appropriate file name
 def getFullPathName(url):
-    from MarketPlaces.Initialization.markets_mining import CURRENT_DATE
+    from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE
+
+    mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages")
     fileName = getNameFromURL(url)
     if isDescriptionLink(url):
-        fullPath = r'..\RobinhoodMarket\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html'
+        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html')
     else:
-        fullPath = r'..\RobinhoodMarket\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html'
+        fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html')
     return fullPath

@@ -174,8 +178,8 @@ def getInterestedLinks():
     # Hacking
     links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/hacking/')
-    # Other Software
-    links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/')
+    # # Other Software
+    # links.append('http://ilr3qzubfnx33vbhal7l5coo4ftqlkv2tboph4ujog5crz6m5ua2b2ad.onion/product-category/other-software/')

     return links
@@ -184,25 +188,24 @@ def crawlForum(driver):
     print("Crawling the Robinhood market")

     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()

     i = 0
-    count = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(driver, html, link)
-
             has_next_page = True
+            count = 0

+            while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(driver, html, link)
+
                 list = productPages(html)
                 for item in list:
@@ -213,27 +216,20 @@ def crawlForum(driver):
                         driver.refresh()
                     savePage(driver, driver.page_source, item)
                     driver.back()
+
                 # comment out
-                # break
+                break

                 # comment out
-                # if count == 1:
-                #     count = 0
-                #     break
+                if count == 1:
+                    break

                 # go to next page of market
                 try:
                     nav = driver.find_element(by=By.XPATH, value="//a[@class='next page-numbers']")
                     link = nav.get_attribute('href')
-
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(driver, html, link)
                     count += 1

                 except NoSuchElementException:
@@ -243,10 +239,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    # finalTime = time.time()
-    # print finalTime - initialTime
-
-    input("Crawling Robinhood market done successfully. Press ENTER to continue\n")
+    print("Crawling the Robinhood market done.")


 # Returns 'True' if the link is Topic link
diff --git a/MarketPlaces/ThiefWorld/crawler_selenium.py b/MarketPlaces/ThiefWorld/crawler_selenium.py
index 194f449..345bdbe 100644
--- a/MarketPlaces/ThiefWorld/crawler_selenium.py
+++ b/MarketPlaces/ThiefWorld/crawler_selenium.py
@@ -1,7 +1,7 @@
 __author__ = 'Helium'

 '''
-ThiefWorld Forum Crawler (Selenium)
+ThiefWorld Market Crawler (Selenium)
 '''

 from selenium import webdriver
@@ -32,7 +32,7 @@ baseURL = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
+    # opentor()
     mktName = getMKTName()
     driver = getAccess()

@@ -44,7 +44,7 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)

-    # new_parse(mktName, baseURL, False)
+    new_parse(mktName, baseURL, True)


 # Opens Tor Browser
@@ -104,7 +104,7 @@ def createFFDriver():
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
     ff_prof.set_preference("network.dns.disablePrefetch", True)
     ff_prof.set_preference("network.http.sendRefererHeader", 0)
-    ff_prof.set_preference("permissions.default.image", 2)
+    ff_prof.set_preference("permissions.default.image", 1)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
     ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@@ -120,6 +120,8 @@ def createFFDriver():

     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver
@@ -260,7 +262,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling ThiefWorld forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the ThiefWorld market done.")


 # Returns 'True' if the link is a description link
diff --git a/MarketPlaces/Tor2door/captcha.png b/MarketPlaces/Tor2door/captcha.png
deleted file mode 100644
index 39bd6f2..0000000
Binary files a/MarketPlaces/Tor2door/captcha.png and /dev/null differ
diff --git a/MarketPlaces/Tor2door/crawler_selenium.py b/MarketPlaces/Tor2door/crawler_selenium.py
index 858ddcf..ec2e37f 100644
--- a/MarketPlaces/Tor2door/crawler_selenium.py
+++ b/MarketPlaces/Tor2door/crawler_selenium.py
@@ -29,8 +29,8 @@ baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion

 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
-    # marketName = getMKTName()
+    # opentor()
+    marketName = getMKTName()
     driver = getAccess()

     if driver != 'down':
@@ -41,7 +41,7 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)

-    # new_parse(marketName, baseURL, False)
+    new_parse(marketName, baseURL, True)


 # Opens Tor Browser
@@ -161,6 +161,8 @@ def createFFDriver():

     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver
@@ -278,7 +280,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling Tor2door market done sucessfully. Press ENTER to continue\n")
+    print("Crawling the Tor2door market done.")


 # Returns 'True' if the link is Topic link
diff --git a/MarketPlaces/TorBay/crawler_selenium.py b/MarketPlaces/TorBay/crawler_selenium.py
index 7968089..5035999 100644
--- a/MarketPlaces/TorBay/crawler_selenium.py
+++ b/MarketPlaces/TorBay/crawler_selenium.py
@@ -34,17 +34,17 @@ baseURL = 'http://torbay3253zck4ym5cbowwvrbfjjzruzthrx3np5y6owvifrnhy5ybid.onion
 def startCrawling():
     # opentor()
     mktName = getMKTName()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
-    #
-    new_parse(mktName, baseURL, False)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)
+
+    new_parse(mktName, baseURL, True)


 # Opens Tor Browser
@@ -120,6 +120,8 @@ def createFFDriver():

     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

 #the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
@@ -230,7 +232,7 @@ def crawlForum(driver):

                 # comment out
                 if count == 1:
-                     break
+                    break

                 try:
                     link = driver.find_element(by=By.XPATH, value=
@@ -246,7 +248,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling TorBay forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the TorBay market done.")


 # Returns 'True' if the link is a description link
diff --git a/MarketPlaces/TorMarket/crawler_selenium.py b/MarketPlaces/TorMarket/crawler_selenium.py
index 7569045..b76fb1c 100644
--- a/MarketPlaces/TorMarket/crawler_selenium.py
+++ b/MarketPlaces/TorMarket/crawler_selenium.py
@@ -33,17 +33,17 @@ baseURL = 'http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion
 def startCrawling():
     # opentor()
     mktName = getMKTName()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()

-    new_parse(mktName, baseURL, False)
+    if driver != 'down':
+        try:
+            # login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)
+
+    new_parse(mktName, baseURL, True)


 # Opens Tor Browser
@@ -103,7 +103,7 @@ def createFFDriver():
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
     ff_prof.set_preference("network.dns.disablePrefetch", True)
     ff_prof.set_preference("network.http.sendRefererHeader", 0)
-    ff_prof.set_preference("permissions.default.image", 2)
+    ff_prof.set_preference("permissions.default.image", 1)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
     ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
@@ -119,6 +119,8 @@ def createFFDriver():

     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver
@@ -184,12 +186,12 @@ def getNameFromURL(url):
 def getInterestedLinks():
     links = []

-    # Hacking Tutorials
-    links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/hacking/')
-    # # Malware
+    # # Hacking Tutorials
+    # links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/guides-tutorials/hacking/')
+    # Malware
     links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/malware/')
     # # Hacking Services
-    links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/hacking-services/')
+    # links.append('http://22222253ebafysmwyrl4uxfcs2xm6k7zb4xyse2csne73atvxu53gfad.onion/product-category/services/hacking-services/')

     return links
@@ -232,8 +234,8 @@ def crawlForum(driver):
                     break

                 # comment out
-                # if count == 1:
-                #     break
+                if count == 1:
+                    break

                 try:
                     link = driver.find_element(by=By.XPATH, value=
@@ -249,7 +251,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling TorMarket forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the TorMarket market done.")


 # Returns 'True' if the link is a description link
diff --git a/MarketPlaces/ViceCity/crawler_selenium.py b/MarketPlaces/ViceCity/crawler_selenium.py
index 91b08cd..cf7ea82 100644
--- a/MarketPlaces/ViceCity/crawler_selenium.py
+++ b/MarketPlaces/ViceCity/crawler_selenium.py
@@ -25,8 +25,6 @@
 from MarketPlaces.Initialization.prepare_parser import new_parse
 from MarketPlaces.ViceCity.parser import vicecity_links_parser
 from MarketPlaces.Utilities.utilities import cleanHTML

-config = configparser.ConfigParser()
-config.read('../../setup.ini')

 counter = 1
 baseURL = 'http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/'
@@ -34,7 +32,7 @@ baseURL = 'http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion
 # Opens Tor Browser, crawls the website, then parses, then closes tor
 #acts like the main method for the crawler, another function at the end of this code calls this function later
 def startCrawling():
-    opentor()
+    # opentor()
     mktName = getMKTName()
     driver = getAccess()

@@ -46,12 +44,14 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)

-    new_parse(mktName, baseURL, False)
+    new_parse(mktName, baseURL, True)


 # Opens Tor Browser
 #prompts for ENTER input to continue
 def opentor():
+    from MarketPlaces.Initialization.markets_mining import config
+
     global pid
     print("Connecting Tor...")
     pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
@@ -90,6 +90,8 @@ def closetor(driver):
 # Creates FireFox 'driver' and configure its 'Profile'
 # to use Tor proxy and socket
 def createFFDriver():
+    from MarketPlaces.Initialization.markets_mining import config
+
     ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

     ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
@@ -118,6 +120,8 @@ def createFFDriver():

     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

 #the driver 'gets' the url, attempting to get on the site, if it can't access return 'down'
@@ -140,9 +144,9 @@ def login(driver):
     # wait for first captcha page to show up (This Xpath may need to change based on different seed url)
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
         (By.XPATH, "/html/body/div/div/form/div/div[1]")))
-    input("Press Enter once captcha done (dont press done)")
+    input("Press Enter once captcha done")
     #clicks button after captcha is inputted
-    driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button').click()
+    # driver.find_element(by=By.XPATH, value='/html/body/div/div/form/button').click()

     #wait for login page to show up
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
@@ -152,9 +156,9 @@ def login(driver):
     userBox.send_keys('ct1234')

     #waits for second catpcha to be inputted by user
-    input("Press Enter once captcha done (dont press continue)")
+    input("Press Enter once captcha done")
     #clicks on continue
-    driver.find_element(by=By.XPATH, value='/html/body/div/div/div/form/input[2]').click()
+    # driver.find_element(by=By.XPATH, value='/html/body/div/div/div/form/input[2]').click()

     #waits for password to show
     WebDriverWait(driver, 100).until(EC.visibility_of_element_located(
@@ -220,12 +224,12 @@ def getInterestedLinks():
     # Digital - Fraud Software, Has Hacking and Guides
     links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=150')
-    # Digital - Guides and Tutorials
-    links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=94')
-    # Carding Services
-    links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=155')
-    # Digital - Other (half junk half random stuff like: bots, rats, viruses, and guides)
-    links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=153')
+    # # Digital - Guides and Tutorials
+    # links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=94')
+    # # Carding Services
+    # links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=155')
+    # # Digital - Other (half junk half random stuff like: bots, rats, viruses, and guides)
+    # links.append('http://52qlucglu6fuaqist2herssakipapig2higaaayu7446n55xw4ylxqid.onion/?category=153')

     return links
@@ -237,26 +241,24 @@ def crawlForum(driver):
     print("Crawling the ViceCity Market")

     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()

-    count = 0
     i = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(driver, html, link)
-
             has_next_page = True
+            count = 0

+            while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(driver, html, link)
+
                 list = productPages(html)
-                j = 0
                 for item in list:
                     itemURL = urlparse.urljoin(baseURL, str(item))
                     try:
@@ -268,25 +270,18 @@ def crawlForum(driver):
                     time.sleep(2.5)  # so site doesnt crash
                     driver.back()

-                #comment out
-                # break
+                # comment out
+                break

-                # # comment out
-                # if count == 1:
-                #     count = 0
-                #     break
+                # comment out
+                if count == 1:
+                    break

                 try:
                     temp = driver.find_element(by=By.CLASS_NAME, value='pagination')
                     link = temp.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(driver, html, link)
                     count += 1

                 except NoSuchElementException:
@@ -296,10 +291,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    # finalTime = time.time()
-    # print finalTime - initialTime
-
-    input("Crawling ViceCity done sucessfully. Press ENTER to continue\n")
+    print("Crawling the ViceCity market done.")


 # Returns 'True' if the link is a description link
diff --git a/setup.ini b/setup.ini
index 41b32d0..f4c18df 100644
--- a/setup.ini
+++ b/setup.ini
@@ -15,4 +15,4 @@ password = password
 database = darkweb_markets_forums

 [Encryption]
-secret = "password"
\ No newline at end of file
+secret = password
\ No newline at end of file
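
Note on the crawler loop refactor: the RobinhoodMarket and ViceCity hunks above apply the same change, moving the page fetch inside a `while has_next_page` loop so that the "next page" branch only updates `link` (and `count`) and lets the top of the loop fetch and save it. Below is a minimal standalone sketch of that loop, not a verbatim copy of either file: it assumes the ViceCity module's helpers (`getInterestedLinks`, `savePage`, `productPages`) with the signatures used in these diffs, and it omits the debug `break`s that the patch re-enables under the `# comment out` markers.

    # Sketch of the pagination loop shared by the refactored crawlers.
    # Assumption: these helpers exist with these signatures, as in this patch.
    import urllib.parse as urlparse

    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    from MarketPlaces.ViceCity.crawler_selenium import (
        getInterestedLinks, savePage, productPages)


    def crawl_market(driver, base_url):
        links_to_crawl = getInterestedLinks()

        i = 0
        while i < len(links_to_crawl):
            link = links_to_crawl[i]
            print('Crawling :', link)
            try:
                has_next_page = True
                count = 0

                while has_next_page:
                    # fetch (or re-fetch) the current listing page and save its HTML
                    try:
                        driver.get(link)
                    except Exception:
                        driver.refresh()
                    html = driver.page_source
                    savePage(driver, html, link)

                    # visit and save every product page referenced by the listing
                    for item in productPages(html):
                        item_url = urlparse.urljoin(base_url, str(item))
                        try:
                            driver.get(item_url)
                        except Exception:
                            driver.refresh()
                        savePage(driver, driver.page_source, item)
                        driver.back()

                    # follow the 'Next' link; the top of the loop does the fetch/save
                    try:
                        link = driver.find_element(
                            by=By.LINK_TEXT, value='Next').get_attribute('href')
                        if link == "":
                            raise NoSuchElementException
                        count += 1
                    except NoSuchElementException:
                        has_next_page = False
            except Exception as e:
                print(link, e)
            i += 1

The design point of the refactor is that a listing page is saved exactly once per iteration, at the loop top, instead of once at entry and again in the next-page branch, which previously saved each intermediate page twice.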