From d30c8066e307536b5e951ec07a15f08833074d5e Mon Sep 17 00:00:00 2001 From: westernmeadow Date: Tue, 20 Jun 2023 02:14:23 -0700 Subject: [PATCH] added setup.ini, global date, and persisting urls --- .idea/DW_Pipeline_Test.iml | 4 +- .idea/misc.xml | 2 +- Forums/CryptBB/crawler_selenium.py | 66 +++++----- Forums/CryptBB/parser.py | 57 ++++----- Forums/DB_Connection/db_connection.py | 16 ++- Forums/Initialization/forums_mining.py | 17 ++- Forums/Initialization/geckodriver.log | 119 ++++++++++++++++++ Forums/Initialization/prepare_parser.py | 41 +++--- Forums/Utilities/utilities.py | 2 +- MarketPlaces/DB_Connection/db_connection.py | 16 ++- MarketPlaces/Initialization/geckodriver.log | 32 +++++ MarketPlaces/Initialization/marketsList.txt | 2 +- MarketPlaces/Initialization/markets_mining.py | 6 +- MarketPlaces/Initialization/prepare_parser.py | 41 +++--- MarketPlaces/ThiefWorld/crawler_selenium.py | 55 ++++---- MarketPlaces/Tor2door/crawler_selenium.py | 55 ++++---- MarketPlaces/Tor2door/parser.py | 49 ++++---- path.txt | 3 - setup.ini | 14 +++ 19 files changed, 370 insertions(+), 227 deletions(-) delete mode 100644 path.txt create mode 100644 setup.ini diff --git a/.idea/DW_Pipeline_Test.iml b/.idea/DW_Pipeline_Test.iml index 71f5e9b..11bc817 100644 --- a/.idea/DW_Pipeline_Test.iml +++ b/.idea/DW_Pipeline_Test.iml @@ -2,7 +2,7 @@ - + @@ -12,6 +12,8 @@ diff --git a/.idea/misc.xml b/.idea/misc.xml index baf04e9..61a3499 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/Forums/CryptBB/crawler_selenium.py b/Forums/CryptBB/crawler_selenium.py index 44db724..655f39a 100644 --- a/Forums/CryptBB/crawler_selenium.py +++ b/Forums/CryptBB/crawler_selenium.py @@ -12,17 +12,19 @@ from selenium.webdriver.firefox.service import Service from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait -from PIL import Image +from PIL import Image import urllib.parse as urlparse import os, re, time -from datetime import date import subprocess +import configparser from bs4 import BeautifulSoup from Forums.Initialization.prepare_parser import new_parse from Forums.CryptBB.parser import cryptBB_links_parser from Forums.Utilities.utilities import cleanHTML +config = configparser.ConfigParser() +config.read('../../setup.ini') counter = 1 baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/' @@ -41,15 +43,14 @@ def startCrawling(): print(driver.current_url, e) closetor(driver) - # new_parse(forumName, False) + # new_parse(forumName, baseURL, False) # Opens Tor Browser def opentor(): global pid print("Connecting Tor...") - path = open('../../path.txt').readline().strip() - pro = subprocess.Popen(path) + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) pid = pro.pid time.sleep(7.5) input('Tor Connected. 
Press ENTER to continue\n') @@ -132,12 +133,9 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): - file = open('../../path.txt', 'r') - lines = file.readlines() - - ff_binary = FirefoxBinary(lines[0].strip()) + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - ff_prof = FirefoxProfile(lines[1].strip()) + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof.set_preference("places.history.enabled", False) ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) @@ -145,7 +143,7 @@ def createFFDriver(): ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True) ff_prof.set_preference("signon.rememberSignons", False) ff_prof.set_preference("network.cookie.lifetimePolicy", 2) - ff_prof.set_preference("network.dns.disablePrefetch", True)# + ff_prof.set_preference("network.dns.disablePrefetch", True) ff_prof.set_preference("network.http.sendRefererHeader", 0) ff_prof.set_preference("permissions.default.image", 3) ff_prof.set_preference("browser.download.folderList", 2) @@ -159,7 +157,7 @@ def createFFDriver(): ff_prof.set_preference("javascript.enabled", True) ff_prof.update_preferences() - service = Service(lines[2].strip()) + service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) @@ -170,10 +168,10 @@ def getAccess(): url = getFixedURL() driver = createFFDriver() try: - driver.get(url)# open url in browser + driver.get(url) return driver except: - driver.close()# close tab + driver.close() return 'down' @@ -188,15 +186,12 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): + from Forums.Initialization.forums_mining import CURRENT_DATE fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\\CryptBB\\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + fullPath = r'..\\CryptBB\\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' else: - fullPath = r'..\\CryptBB\\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + fullPath = r'..\\CryptBB\\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' return fullPath @@ -204,7 +199,7 @@ def getFullPathName(url): def getNameFromURL(url): global counter name = ''.join(e for e in url if e.isalnum()) - if (name == ''): + if name == '': name = str(counter) counter = counter + 1 return name @@ -226,7 +221,7 @@ def getInterestedLinks(): # # Training Challenges # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=96') # Darknet Discussions - #links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88') + # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=88') # # Public Leaks and Warez # links.append('http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=97') # # Hacked Accounts and Database Dumps @@ -251,7 +246,7 @@ def crawlForum(driver): print('Crawling :', link) try: try: - driver.get(link)# open + 
driver.get(link) except: driver.refresh() html = driver.page_source @@ -259,10 +254,17 @@ def crawlForum(driver): has_next_page = True - #loop through the topics while has_next_page: - list = topicPages(html)# for multiple pages + list = topicPages(html) for item in list: + itemURL = urlparse.urljoin(baseURL, str(item)) + try: + driver.get(itemURL) + except: + driver.refresh() + savePage(driver.page_source, item) + driver.back() + ''' #variable to check if there is a next page for the topic has_next_topic_page = True counter = 1 @@ -291,18 +293,19 @@ def crawlForum(driver): except NoSuchElementException: has_next_topic_page = False - #end of loop + # end of loop for i in range(counter): driver.back() + ''' # comment out - #break + break # comment out - #if count == 1: - # count = 0 - # break + if count == 1: + count = 0 + break - try:# change depending on web page, #next page + try: temp = driver.find_element(by=By.XPATH, value = '/html/body/div/div[2]/div/div[2]/div') link = temp.find_element(by=By.CLASS_NAME, value='pagination_next').get_attribute('href') @@ -346,7 +349,6 @@ def isListingLink(url): # calling the parser to define the links def topicPages(html): soup = BeautifulSoup(html, "html.parser") - #print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr',{"class": "inline_row"}).find('strong').text) return cryptBB_links_parser(soup) diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py index 0957b76..318b04e 100644 --- a/Forums/CryptBB/parser.py +++ b/Forums/CryptBB/parser.py @@ -15,15 +15,15 @@ def cryptBB_description_parser(soup): # Fields to be parsed - topic = "-1" # topic name - user = [] # all users of each post - addDate = [] # all dated of each post - feedback = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format) - status = [] # all user's authority in each post such as (adm, member, dangerous) - reputation = [] # all user's karma in each post (usually found as a number) - sign = [] # all user's signature in each post (usually a standard message after the content of the post) - post = [] # all messages of each post - interest = [] # all user's interest in each post + topic = "-1" # 0 *topic name + user = [] # 1 *all users of each post + status = [] # 2 all user's authority in each post such as (adm, member, dangerous) + reputation = [] # 3 all user's karma in each post (usually found as a number) + interest = [] # 4 all user's interest in each post + sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) + post = [] # 6 all messages of each post + feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) + addDate = [] # 8 all dated of each post # Finding the topic (should be just one coming from the Listing Page) @@ -154,20 +154,6 @@ def cryptBB_description_parser(soup): feedback.append("-1") - ''' - except: - if soup.find('td', {"class": "trow1"}).text == " You do not have permission to access this page. 
": - user.append("-1") - status.append(-1) - interest.append(-1) - reputation.append(-1) - addDate.append(-1) - post.append("NO ACCESS TO THIS PAGE!") - sign.append(-1) - feedback.append(-1) - ''' - - # Populate the final variable (this should be a list with all fields scraped) row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) @@ -180,17 +166,17 @@ def cryptBB_description_parser(soup): def cryptBB_listing_parser(soup): - board = "-1" # board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) - - nm = 0 # this variable should receive the number of topics - topic = [] # all topics - author = [] # all authors of each topic - views = [] # number of views of each topic - posts = [] # number of posts of each topic - addDate = [] # when the topic was created (difficult to find) - href = [] # this variable should receive all cleaned urls (we will use this to do the marge between - # Listing and Description pages) + nm = 0 # *this variable should receive the number of topics + forum = "CryptBB" # 0 *forum name + board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) + topic = [] # 2 *all topics + author = [] # 3 *all authors of each topic + views = [] # 4 number of views of each topic + posts = [] # 5 number of posts of each topic + href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between + # Listing and Description pages) + addDate = [] # 7 when the topic was created (difficult to find) # Finding the board (should be just one) @@ -223,7 +209,6 @@ def cryptBB_listing_parser(soup): link = itopic.find('span', {"class": "subject_old"}).find('a').get('href') except: link = itopic.find('span',{"class": "subject_new"}).find('a').get('href') - link = cleanLink(link) href.append(link) # Finding the author of the topic @@ -245,7 +230,7 @@ def cryptBB_listing_parser(soup): addDate.append("-1") - return organizeTopics("CryptBB", nm, topic, board, author, views, posts, href, addDate) + return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) def cryptBB_links_parser(soup): diff --git a/Forums/DB_Connection/db_connection.py b/Forums/DB_Connection/db_connection.py index 619b85e..eeaf69b 100644 --- a/Forums/DB_Connection/db_connection.py +++ b/Forums/DB_Connection/db_connection.py @@ -2,15 +2,21 @@ __author__ = 'DarkWeb' import psycopg2 import traceback -import time -from datetime import date +import configparser def connectDataBase(): try: - return psycopg2.connect(host='localhost', user='postgres', password='password', dbname='darkweb_markets_forums') + config = configparser.ConfigParser() + config.read('../../setup.ini') + ip = config.get('PostgreSQL', 'ip') + username = config.get('PostgreSQL', 'username') + password = config.get('PostgreSQL', 'password') + database = config.get('PostgreSQL', 'database') + + return psycopg2.connect(host=ip, user=username, password=password, dbname=database) except: @@ -197,7 +203,7 @@ def getLastPost(cur): ''' -def create_forum(cur, row): +def create_forum(cur, row, url): forumId = verifyForum(cur, row[0]) @@ -207,7 +213,7 @@ def create_forum(cur, row): sql = "Insert into forums (forum_id, name_forum, url_forum, dateinserted_forum) Values (%s, %s, %s, %s)" - recset = [forumId, row[0], None, row[8]] + recset = [forumId, row[0], url, 
row[8]] cur.execute(sql, recset) diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py index 71907e0..f431f97 100644 --- a/Forums/Initialization/forums_mining.py +++ b/Forums/Initialization/forums_mining.py @@ -9,10 +9,12 @@ from datetime import * from Forums.BestCardingWorld.crawler_selenium import crawler as crawlerBestCardingWorld from Forums.CryptBB.crawler_selenium import crawler as crawlerCryptBB from Forums.OnniForums.crawler_selenium import crawler as crawlerOnniForums -#from Forums.CrackingPro.crawler_selenium import crawler as crawlerCrackingPro +# from Forums.CrackingPro.crawler_selenium import crawler as crawlerCrackingPro import time +CURRENT_DATE = str("%02d" % date.today().month) + str("%02d" % date.today().day) + str("%04d" % date.today().year) + # reads list of marketplaces manually inputted def getForums(): @@ -30,8 +32,6 @@ def createDirectory(forum): pagesMainDir = '../' + forum else: pagesMainDir = '../' + forum + "/HTML_Pages" - # sharedFolderPath = r'\\VBoxSvr\VM_Files_(shared)' - # pagesMainDir = os.path.join(sharedFolderPath, 'HTML/Forums/' + forum + '/HTML_Pages') if not os.path.isdir(pagesMainDir): os.makedirs(pagesMainDir) @@ -58,7 +58,7 @@ def createRedditsSubdirectories(pagesMainDir): def createSubdirectories(pagesDir): - currentDateDir = pagesDir + '/' + str("%02d" %date.today().month) + str("%02d" %date.today().day) + str("%04d" %date.today().year) + currentDateDir = pagesDir + '/' + CURRENT_DATE if not os.path.isdir(currentDateDir): os.mkdir(currentDateDir) @@ -79,19 +79,19 @@ def createSubdirectories(pagesDir): os.mkdir(descReadDir) -#main method +# main method if __name__ == '__main__': - #assignment from forumsList.txt + # assignment from forumsList.txt forumsList = getForums() - #get forum from forumsList + # get forum from forumsList for forum in forumsList: forum = forum.replace('\n','') print("Creating listing and description directories ...") createDirectory(forum) - time.sleep(5) #wait for directories to be created + time.sleep(5) # wait for directories to be created input("Directories created successfully. Press ENTER to continue\n") if forum == "BestCardingWorld": @@ -103,7 +103,6 @@ if __name__ == '__main__': elif forum == "CrackingPro": crawlerCrackingPro() - print("Scraping process completed successfully!") diff --git a/Forums/Initialization/geckodriver.log b/Forums/Initialization/geckodriver.log index c206435..15928b8 100644 --- a/Forums/Initialization/geckodriver.log +++ b/Forums/Initialization/geckodriver.log @@ -3963,3 +3963,122 @@ JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}" JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 109: Error: TelemetryStopwatch: finishing nonexisting stopwatch. Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS", key: "" JavaScript error: resource://gre/modules/ExtensionTelemetry.jsm, line 113: Error: TelemetryStopwatch: finishing nonexisting stopwatch. 
Histogram: "WEBEXT_CONTENT_SCRIPT_INJECTION_MS_BY_ADDONID", key: "{73a6fe31-595d-460b-a920-fcc0f8843232}" +1687240079948 geckodriver INFO Listening on 127.0.0.1:50448 +1687240084735 mozrunner::runner INFO Running command: "C:\\Users\\calsyslab\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "50449" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\CALSYS~1\\AppData\\Local\\Temp\\rust_mozprofileuYe2AP" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1687240085868 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:50449/devtools/browser/e85e6865-1f97-480a-8e46-778271184a87 +1687240090364 Marionette INFO Listening on port 50454 +1687240090846 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/, line 2: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 2: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 5: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 9: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\calsyslab\AppData\Local\Temp\rust_mozprofileuYe2AP\thumbnails) because it does not exist +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 6: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined +JavaScript error: 
http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 19: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 25: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 6: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 19: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628&page=2, line 25: ReferenceError: $ is not defined +1687240218310 Marionette INFO Stopped listening on port 50454 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\calsyslab\AppData\Local\Temp\rust_mozprofileuYe2AP\thumbnails) because it does not exist +1687240220095 RemoteAgent ERROR unable to stop listener: [Exception... 
"Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1687240311209 geckodriver INFO Listening on 127.0.0.1:50519 +1687240315070 mozrunner::runner INFO Running command: "C:\\Users\\calsyslab\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "50520" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\CALSYS~1\\AppData\\Local\\Temp\\rust_mozprofiletzrkDs" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1687240315958 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. 
+JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:50520/devtools/browser/4b6276ea-c420-4b6d-b4bc-fda679f97800 +1687240317156 Marionette INFO Listening on port 50525 +1687240317256 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/, line 2: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 2: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 5: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/member.php?action=login, line 9: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 6: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 19: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=2628, line 25: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86&page=2, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16404, line 
6: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/report.js?ver=1804, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/thread.js?ver=1809, line 4: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16404, line 19: ReferenceError: use_xmlhttprequest is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/showthread.php?tid=16404, line 25: ReferenceError: $ is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/forumdisplay.php?fid=86&page=2, line 3: ReferenceError: lang is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/jeditable/jeditable.min.js, line 38: ReferenceError: jQuery is not defined +JavaScript error: http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion/jscripts/inline_edit.js?ver=1808, line 6: ReferenceError: $ is not defined +1687240409940 Marionette INFO Stopped listening on port 50525 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] + +###!!! [Parent][MessageChannel] Error: (msgtype=0x140007,name=PBackgroundLSDatabase::Msg_RequestAllowToClose) Channel error: cannot send/recv + +[Parent 1036, IPC I/O Parent] WARNING: file /var/tmp/build/firefox-b6010b1466c9/ipc/chromium/src/base/process_util_win.cc:167 +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\calsyslab\AppData\Local\Temp\rust_mozprofiletzrkDs\thumbnails) because it does not exist + +###!!! [Child][MessageChannel] Error: (msgtype=0x5D0005,name=PImageBridge::Msg_WillClose) Channel error: cannot send/recv + +1687240410572 RemoteAgent ERROR unable to stop listener: [Exception... 
"Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index 23d97f1..2efb84d 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -9,7 +9,7 @@ from Forums.BestCardingWorld.parser import * from Forums.CryptBB.parser import * from Forums.Classifier.classify_product import predict -#from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi +# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi # determines if forum is russian, not really used now but maybe later @@ -62,9 +62,9 @@ def getPosts(posts): #uses db connection , another program, methods to persists values to the correct categories #@param: row is the list of entries for this instance, cur is the db connection object -def persist_data(row, cur): +def persist_data(url, row, cur): - forum = create_forum(cur, row) + forum = create_forum(cur, row, url) board = create_board(cur, row, forum) @@ -77,15 +77,13 @@ def persist_data(row, cur): #main method for this program, what actually gets the parsed info from the parser, and persists them into the db #calls the different parser methods here depending on the type of html page -def new_parse(forum, createLog): +def new_parse(forum, url, createLog): - print("Parsing The " + forum + " Forum and conduct data classification to store the information in the database.") - - crawlerDate = date.today() + from Forums.Initialization.forums_mining import CURRENT_DATE - ini = time.time() + print("Parsing The " + forum + " Forum and conduct data classification to store the information in the database.") - global site + # ini = time.time() # Connecting to the database con = connectDataBase() @@ -96,27 +94,26 @@ def new_parse(forum, createLog): nError = 0 - lines = [] #lines.clear() - lns = [] #lns.clear() + lines = [] # listing pages + lns = [] # description pages detPage = {} - rw = [] # Creating the log file for each Forum if createLog: - if not os.path.exists("./" + forum + "/Logs/" + forum + "_" + str("%02d" %crawlerDate.month) + str("%02d" %crawlerDate.day) + str("%04d" %crawlerDate.year) + ".log"): - logFile = open("./" + forum + "/Logs/" + forum + "_" + str("%02d" %crawlerDate.today().month) + str("%02d" %crawlerDate.day) + str("%04d" %crawlerDate.year) + ".log", "w") + if not os.path.exists("./" + forum + "/Logs/" + forum + "_" + CURRENT_DATE + ".log"): + logFile = open("./" + forum + "/Logs/" + forum + "_" + CURRENT_DATE + ".log", "w") else: - print("Files of the date " + str("%02d" %crawlerDate.today().month) + str("%02d" %crawlerDate.today().day) + str("%04d" %crawlerDate.today().year) + - " from the Forum " + forum + " were already read. Delete the referent information in the Data Base and also delete the log file " - "in the _Logs folder to read files from this Forum of this date again.") + print("Files of the date " + CURRENT_DATE + " from the Forum " + forum + + " were already read. 
Delete the referent information in the Data Base and also delete the log file" + " in the _Logs folder to read files from this Forum of this date again.") raise SystemExit # Reading the Listing Html Pages - for fileListing in glob.glob(os.path.join (os.getcwd().replace("Initialization","") + forum + "\\HTML_Pages\\" + str("%02d" %crawlerDate.month) + str("%02d" %crawlerDate.day) + str("%04d" %crawlerDate.year) + "\\Listing" ,'*.html')): + for fileListing in glob.glob(os.path.join("..\\" + forum + "\\HTML_Pages\\" + CURRENT_DATE + "\\Listing", '*.html')): lines.append(fileListing) # Reading the Description Html Pages - for fileDescription in glob.glob(os.path.join (os.getcwd().replace("Initialization","") + forum + "\\HTML_Pages\\" + str("%02d" %crawlerDate.month) + str("%02d" %crawlerDate.day) + str("%04d" %crawlerDate.year) + "\\Description" ,'*.html')): + for fileDescription in glob.glob(os.path.join("..\\" + forum + "\\HTML_Pages\\" + CURRENT_DATE + "\\Description" ,'*.html')): lns.append(fileDescription) # Parsing the Description Pages and put the tag's content into a dictionary (Hash table) @@ -218,9 +215,7 @@ def new_parse(forum, createLog): rec = rec.split(',') # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip() - # key = rec[16] - url = ''.join(e for e in rec[6] if e.isalnum()) - key = u"Url:" + url + key = u"Url:" + cleanLink(rec[6]) if key in detPage: @@ -237,7 +232,7 @@ def new_parse(forum, createLog): # Persisting the information in the database try: - persist_data(tuple(rec), cur) + persist_data(url, tuple(rec), cur) con.commit() except: diff --git a/Forums/Utilities/utilities.py b/Forums/Utilities/utilities.py index 9d64cb6..d8ca9eb 100644 --- a/Forums/Utilities/utilities.py +++ b/Forums/Utilities/utilities.py @@ -160,7 +160,7 @@ def cleanLink(originalLink): return originalLink -def organizeTopics(forum, nm, topic, board, author, views, posts, href, addDate): +def organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate): day = time.strftime("%m/%d/%Y") ahora = time.strftime("%I:%M:%S") diff --git a/MarketPlaces/DB_Connection/db_connection.py b/MarketPlaces/DB_Connection/db_connection.py index 9cabf34..97296e3 100644 --- a/MarketPlaces/DB_Connection/db_connection.py +++ b/MarketPlaces/DB_Connection/db_connection.py @@ -2,15 +2,21 @@ __author__ = 'DarkWeb' import psycopg2 import traceback -import time -from datetime import date +import configparser def connectDataBase(): try: - return psycopg2.connect(host='localhost', user='postgres', password='password', dbname='darkweb_markets_forums') + config = configparser.ConfigParser() + config.read('../../setup.ini') + ip = config.get('PostgreSQL', 'ip') + username = config.get('PostgreSQL', 'username') + password = config.get('PostgreSQL', 'password') + database = config.get('PostgreSQL', 'database') + + return psycopg2.connect(host=ip, user=username, password=password, dbname=database) except: @@ -95,7 +101,7 @@ def getLastVendor(cur): print (trace) -def create_marketPlace(cur, row): +def create_marketPlace(cur, row, url): marketId = verifyMarketPlace(cur, row[0]) @@ -105,7 +111,7 @@ def create_marketPlace(cur, row): sql = "Insert into marketplaces (market_id, name_market, url_market, dateinserted_market) " \ "Values (%s, %s, %s, %s)" - recset = [marketId, row[0], None, row[21]] + recset = [marketId, row[0], url, row[21]] cur.execute(sql, recset) diff --git a/MarketPlaces/Initialization/geckodriver.log b/MarketPlaces/Initialization/geckodriver.log index 51d45ff..7f95777 100644 --- 
a/MarketPlaces/Initialization/geckodriver.log +++ b/MarketPlaces/Initialization/geckodriver.log @@ -6073,3 +6073,35 @@ unwatchForTargets()@TargetList.jsm:37 destructor()@TargetList.jsm:109 stop()@CDP.jsm:104 close()@RemoteAgent.jsm:138 +1687245533907 geckodriver INFO Listening on 127.0.0.1:62051 +1687245536832 mozrunner::runner INFO Running command: "C:\\Users\\calsyslab\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "62052" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\CALSYS~1\\AppData\\Local\\Temp\\rust_mozprofileuMGaeY" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1687245537956 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:62052/devtools/browser/9cf17e56-2fb1-468d-b65e-15c4de4eaa64 +1687245540759 Marionette INFO Listening on port 49935 +1687245540897 RemoteAgent WARN TLS certificate errors will be ignored for this session +1687245639406 Marionette INFO Stopped listening on port 49935 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PageThumbs.jsm, line 709: AbortError: IOUtils.profileBeforeChange getter: IOUtils: profileBeforeChange phase has already finished +1687245650576 RemoteAgent ERROR unable to stop listener: [Exception... 
"Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt index 87f811c..b85ae71 100644 --- a/MarketPlaces/Initialization/marketsList.txt +++ b/MarketPlaces/Initialization/marketsList.txt @@ -1 +1 @@ -ThiefWorld \ No newline at end of file +Tor2door \ No newline at end of file diff --git a/MarketPlaces/Initialization/markets_mining.py b/MarketPlaces/Initialization/markets_mining.py index 42bb51c..3073612 100644 --- a/MarketPlaces/Initialization/markets_mining.py +++ b/MarketPlaces/Initialization/markets_mining.py @@ -12,6 +12,8 @@ from MarketPlaces.ThiefWorld.crawler_selenium import crawler as crawlerThiefWorl import time +CURRENT_DATE = str("%02d" % date.today().month) + str("%02d" % date.today().day) + str("%04d" % date.today().year) + # reads list of marketplaces def getMarkets(): @@ -26,12 +28,10 @@ def createDirectory(mkt): # Package should already be there, holding crawler and parser pagesDir = '../' + mkt + '/HTML_Pages' - # sharedFolderPath = r'\\VBoxSvr\VM_Files_(shared)' - # pagesDir = os.path.join(sharedFolderPath, 'HTML/MarketPlaces/' + mkt + '/HTML_Pages') if not os.path.isdir(pagesDir): os.makedirs(pagesDir) - currentDateDir = pagesDir + '/' + str("%02d" %date.today().month) + str("%02d" %date.today().day) + str("%04d" %date.today().year) + currentDateDir = pagesDir + '/' + CURRENT_DATE if not os.path.isdir(currentDateDir): os.mkdir(currentDateDir) diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index 2389834..de13899 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ b/MarketPlaces/Initialization/prepare_parser.py @@ -60,55 +60,52 @@ def mergePages(rmm, rec): return rec -def persist_data(row, cur): +def persist_data(url, row, cur): - marketPlace = create_marketPlace(cur, row) + marketPlace = create_marketPlace(cur, row, url) vendor = create_vendor(cur, row, marketPlace) create_items(cur, row, marketPlace, vendor) -def new_parse(marketPlace, createLog): +def new_parse(marketPlace, url, createLog): - print("Parsing the " + marketPlace + " marketplace and conduct data classification to store the information in the database.") + from MarketPlaces.Initialization.markets_mining import CURRENT_DATE - crawlerDate = date.today() + print("Parsing the " + marketPlace + " marketplace and conduct data classification to store the information in the database.") # ini = time.time() - global site - - #Connecting to the database + # Connecting to the database con = connectDataBase() cur = con.cursor() - #Creating the tables (The database should be created manually) + # Creating the tables (The database should be created manually) create_database(cur, con) nError = 0 - lines = [] #lines.clear() - lns = [] #lns.clear() + lines = [] # listing pages + lns = [] # description pages detPage = {} - rw = [] #Creating the log file for each Market Place if createLog: - if not os.path.exists("./" + marketPlace + "/Logs/" + marketPlace + "_" + str("%02d" %crawlerDate.month) + str("%02d" %crawlerDate.day) + str("%04d" %crawlerDate.year) + 
".log"): - logFile = open("./" + marketPlace + "/Logs/" + marketPlace + "_" + str("%02d" %crawlerDate.month) + str("%02d" %crawlerDate.day) + str("%04d" %crawlerDate.year) + ".log", "w") + if not os.path.exists("./" + marketPlace + "/Logs/" + marketPlace + "_" + CURRENT_DATE + ".log"): + logFile = open("./" + marketPlace + "/Logs/" + marketPlace + "_" + CURRENT_DATE + ".log", "w") else: - print("Files of the date " + str("%02d" %crawlerDate.month) + "/" + str("%02d" %crawlerDate.day) + "/" + str("%04d" %crawlerDate.year) + - " from the Market Place " + marketPlace + " were already read. Delete the referent information in the Data Base and also delete the log file " - "in the _Logs folder to read files from this Market Place of this date again.") + print("Files of the date " + CURRENT_DATE + " from the Market Place " + marketPlace + + " were already read. Delete the referent information in the Data Base and also delete the log file" + " in the _Logs folder to read files from this Market Place of this date again.") raise SystemExit # Reading the Listing Html Pages - for fileListing in glob.glob(os.path.join (os.getcwd().replace("Initialization","") + marketPlace + "\\HTML_Pages\\" + str("%02d" %crawlerDate.month) + str("%02d" %crawlerDate.day) + str("%04d" %crawlerDate.year) + "\\Listing" ,'*.html')): + for fileListing in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\" + CURRENT_DATE + "\\Listing", '*.html')): lines.append(fileListing) # Reading the Description Html Pages - for fileDescription in glob.glob(os.path.join (os.getcwd().replace("Initialization","") + marketPlace + "\\HTML_Pages\\" + str("%02d" %crawlerDate.month) + str("%02d" %crawlerDate.day) + str("%04d" %crawlerDate.year) + "\\Description" ,'*.html')): + for fileDescription in glob.glob(os.path.join("..\\" + marketPlace + "\\HTML_Pages\\" + CURRENT_DATE + "\\Description", '*.html')): lns.append(fileDescription) # Parsing the Description Pages and put the tag's content into a dictionary (Hash table) @@ -214,9 +211,7 @@ def new_parse(marketPlace, createLog): # key = rec[23] # key = u"Pr:" + rec[1].upper()[:list_lim1] + u" Vendor:" + rec[18].upper()[:list_lim2] - # key = u"Pr:" + rec[1].upper() - url = ''.join(e for e in rec[20] if e.isalnum()) - key = u"Url:" + url + key = u"Url:" + cleanLink(rec[20]) # if the associated description page is parsed if key in detPage: @@ -233,7 +228,7 @@ def new_parse(marketPlace, createLog): # Persisting the information in the database try: - persist_data(tuple(rec), cur) + persist_data(url, tuple(rec), cur) con.commit() except: diff --git a/MarketPlaces/ThiefWorld/crawler_selenium.py b/MarketPlaces/ThiefWorld/crawler_selenium.py index 34d606d..3d3c28a 100644 --- a/MarketPlaces/ThiefWorld/crawler_selenium.py +++ b/MarketPlaces/ThiefWorld/crawler_selenium.py @@ -15,14 +15,17 @@ from selenium.webdriver.common.by import By from PIL import Image import urllib.parse as urlparse -import os, time +import os, re, time from datetime import date import subprocess +import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.ThiefWorld.parser import thiefworld_links_parser from MarketPlaces.Utilities.utilities import cleanHTML +config = configparser.ConfigParser() +config.read('../../setup.ini') counter = 1 baseURL = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/' @@ -31,7 +34,7 @@ baseURL = 'http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion #acts like the main method for the 
crawler, another function at the end of this code calls this function later def startCrawling(): opentor() - mktName = getMKTName() + # mktName = getMKTName() driver = getAccess() if driver != 'down': @@ -42,7 +45,7 @@ def startCrawling(): print(driver.current_url, e) closetor(driver) - #new_parse(mktName, False) + # new_parse(mktName, False) # Opens Tor Browser @@ -50,8 +53,7 @@ def startCrawling(): def opentor(): global pid print("Connecting Tor...") - path = open('../../path.txt').readline().strip() - pro = subprocess.Popen(path) + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) pid = pro.pid time.sleep(7.5) input('Tor Connected. Press ENTER to continue\n') @@ -61,7 +63,7 @@ def opentor(): # Returns the name of the website #return: name of site in string type def getMKTName(): - name = 'TheifWorld' + name = 'ThiefWorld' return name @@ -87,12 +89,9 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): - file = open('../../path.txt', 'r') - lines = file.readlines() + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - ff_binary = FirefoxBinary(lines[0].strip()) - - ff_prof = FirefoxProfile(lines[1].strip()) + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof.set_preference("places.history.enabled", False) ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) @@ -114,7 +113,7 @@ def createFFDriver(): ff_prof.set_preference("javascript.enabled", False) ff_prof.update_preferences() - service = Service(lines[2].strip()) + service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) @@ -162,15 +161,12 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name #@param: raw url as crawler crawls through every site def getFullPathName(url): + from MarketPlaces.Initialization.markets_mining import CURRENT_DATE fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\ThiefWorld\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + fullPath = r'..\ThiefWorld\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' else: - fullPath = r'..\ThiefWorld\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + fullPath = r'..\ThiefWorld\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' return fullPath @@ -191,14 +187,15 @@ def getNameFromURL(url): #as you can see they are categories of products def getInterestedLinks(): links = [] + # Hacking and DDOS links.append('http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/35') # # Carding Manuals - links.append('http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/20') + # links.append('http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/20') # # Software - links.append('http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/37') - # #Database - links.append('http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/38') + # 
links.append('http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/37') + # # Database + # links.append('http://qsw7iurcrdwyml5kg4oxbmtqrcnpxiag3iumdarefzeunnyc2dnyljad.onion/catalog/38') return links @@ -238,13 +235,13 @@ def crawlForum(driver): savePage(driver.page_source, item) driver.back() - # # comment out - # break - # - # # # comment out - # if count == 1: - # count = 0 - # break + # comment out + break + + # comment out + if count == 1: + count = 0 + break try: link = driver.find_element(by=By.XPATH, value= diff --git a/MarketPlaces/Tor2door/crawler_selenium.py b/MarketPlaces/Tor2door/crawler_selenium.py index b7e7937..baef719 100644 --- a/MarketPlaces/Tor2door/crawler_selenium.py +++ b/MarketPlaces/Tor2door/crawler_selenium.py @@ -15,41 +15,42 @@ from selenium.webdriver.support.ui import WebDriverWait from PIL import Image import urllib.parse as urlparse -import os, time -from datetime import date +import os, re, time import subprocess +import configparser from bs4 import BeautifulSoup from MarketPlaces.Initialization.prepare_parser import new_parse from MarketPlaces.Tor2door.parser import tor2door_links_parser from MarketPlaces.Utilities.utilities import cleanHTML +config = configparser.ConfigParser() +config.read('../../setup.ini') counter = 1 -baseURL = 'http://http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion' +baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion' # Opens Tor Browser, crawls the website def startCrawling(): - opentor() - # marketName = getMarketName() - driver = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closetor(driver) - - # new_parse(marketName, False) + # opentor() + marketName = getMarketName() + # driver = getAccess() + # + # if driver != 'down': + # try: + # login(driver) + # crawlForum(driver) + # except Exception as e: + # print(driver.current_url, e) + # closetor(driver) + # + new_parse(marketName, baseURL, False) # Opens Tor Browser def opentor(): global pid print("Connecting Tor...") - path = open('../../path.txt').readline().strip() - pro = subprocess.Popen(path) + pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path')) pid = pro.pid time.sleep(7.5) input('Tor Connected. 
Press ENTER to continue\n') @@ -130,12 +131,9 @@ def closetor(driver): # Creates FireFox 'driver' and configure its 'Profile' # to use Tor proxy and socket def createFFDriver(): - file = open('../../path.txt', 'r') - lines = file.readlines() - - ff_binary = FirefoxBinary(lines[0].strip()) + ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path')) - ff_prof = FirefoxProfile(lines[1].strip()) + ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path')) ff_prof.set_preference("places.history.enabled", False) ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True) ff_prof.set_preference("privacy.clearOnShutdown.passwords", True) @@ -157,7 +155,7 @@ def createFFDriver(): ff_prof.set_preference("javascript.enabled", False) ff_prof.update_preferences() - service = Service(executable_path=lines[2].strip()) + service = Service(config.get('TOR', 'geckodriver_path')) driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) @@ -186,15 +184,12 @@ def savePage(page, url): # Gets the full path of the page to be saved along with its appropriate file name def getFullPathName(url): + from MarketPlaces.Initialization.markets_mining import CURRENT_DATE fileName = getNameFromURL(url) if isDescriptionLink(url): - fullPath = r'..\Tor2door\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Description\\' + fileName + '.html' + fullPath = r'..\Tor2door\HTML_Pages\\' + CURRENT_DATE + r'\\Description\\' + fileName + '.html' else: - fullPath = r'..\Tor2door\HTML_Pages\\' + str( - "%02d" % date.today().month) + str("%02d" % date.today().day) + str( - "%04d" % date.today().year) + r'\\' + r'Listing\\' + fileName + '.html' + fullPath = r'..\Tor2door\HTML_Pages\\' + CURRENT_DATE + r'\\Listing\\' + fileName + '.html' return fullPath diff --git a/MarketPlaces/Tor2door/parser.py b/MarketPlaces/Tor2door/parser.py index 105fc99..f4a4c07 100644 --- a/MarketPlaces/Tor2door/parser.py +++ b/MarketPlaces/Tor2door/parser.py @@ -12,10 +12,10 @@ def tor2door_description_parser(soup): # Fields to be parsed - vendor = "-1" # 0 Vendor_Name + vendor = "-1" # 0 *Vendor_Name success = "-1" # 1 Vendor_Successful_Transactions rating_vendor = "-1" # 2 Vendor_Rating - name = "-1" # 3 Product_Name + name = "-1" # 3 *Product_Name describe = "-1" # 4 Product_Description CVE = "-1" # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) MS = "-1" # 6 Product_MS_Classification (Microsoft Security) @@ -118,28 +118,28 @@ def tor2door_description_parser(soup): def tor2door_listing_parser(soup): # Fields to be parsed - nm = 0 # Total_Products (Should be Integer) - mktName = "Tor2door" # 0 Marketplace_Name - vendor = [] # 18 Vendor y - rating_vendor = [] # 19 Vendor_Rating - success = [] # 20 Vendor_Successful_Transactions - name = [] # 1 Product_Name y - CVE = [] # 2 Product_CVE_Classification (Common Vulnerabilities and Exposures) - MS = [] # 3 Product_MS_Classification (Microsoft Security) - category = [] # 4 Product_Category y - describe = [] # 5 Product_Description - views = [] # 7 Product_Number_Of_Views - reviews = [] # 7 Product_Number_Of_Reviews - rating_item = [] # 8 Product_Rating - addDate = [] # 9 Product_AddDate - BTC = [] # 11 Product_BTC_SellingPrice - USD = [] # 12 Product_USD_SellingPrice y - EURO = [] # 13 Product_EURO_SellingPrice - sold = [] # 14 Product_QuantitySold - qLeft =[] # 15 Product_QuantityLeft - shipFrom = [] # 16 Product_ShippedFrom - shipTo = [] # 17 
Product_ShippedTo - href = [] # 24 Product_Links + nm = 0 # *Total_Products (Should be Integer) + mktName = "Tor2door" # 0 *Marketplace_Name + vendor = [] # 1 *Vendor y + rating_vendor = [] # 2 Vendor_Rating + success = [] # 3 Vendor_Successful_Transactions + name = [] # 4 *Product_Name y + CVE = [] # 5 Product_CVE_Classification (Common Vulnerabilities and Exposures) + MS = [] # 6 Product_MS_Classification (Microsoft Security) + category = [] # 7 Product_Category y + describe = [] # 8 Product_Description + views = [] # 9 Product_Number_Of_Views + reviews = [] # 10 Product_Number_Of_Reviews + rating_item = [] # 11 Product_Rating + addDate = [] # 12 Product_AddDate + BTC = [] # 13 Product_BTC_SellingPrice + USD = [] # 14 Product_USD_SellingPrice y + EURO = [] # 15 Product_EURO_SellingPrice + sold = [] # 16 Product_QuantitySold + qLeft =[] # 17 Product_QuantityLeft + shipFrom = [] # 18 Product_ShippedFrom + shipTo = [] # 19 Product_ShippedTo + href = [] # 20 Product_Links listing = soup.findAll('div', {"class": "card product-card mb-3"}) @@ -160,7 +160,6 @@ def tor2door_listing_parser(soup): # Adding the url to the list of urls link = bae[0].get('href') - link = cleanLink(link) href.append(link) # Finding Product Name diff --git a/path.txt b/path.txt deleted file mode 100644 index 3992963..0000000 --- a/path.txt +++ /dev/null @@ -1,3 +0,0 @@ -C:\Users\Helium\Desktop\Tor Browser\Browser\firefox.exe -C:\Users\Helium\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default -C:\Users\Helium\PycharmProjects\dw_pipeline_test\selenium\geckodriver.exe \ No newline at end of file diff --git a/setup.ini b/setup.ini new file mode 100644 index 0000000..38c2347 --- /dev/null +++ b/setup.ini @@ -0,0 +1,14 @@ +[TOR] +firefox_binary_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\firefox.exe +firefox_profile_path = C:\Users\calsyslab\Desktop\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default +geckodriver_path = C:\Users\calsyslab\Projects\dw_pipeline_test\selenium\geckodriver.exe + +[Project] +project_directory = C:\Users\calsyslab\Projects\dw_pipeline_test +shared_folder = \\VBoxSvr\VM_Files_(shared) + +[PostgreSQL] +ip = localhost +username = postgres +password = password +database = darkweb_markets_forums \ No newline at end of file