diff --git a/.gitignore b/.gitignore index fe0eeb1..dbe1559 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ /shelf/ .idea/workspace.xml selenium/geckodriver.exe +__pycache__ setup.ini *.html *.log diff --git a/Forums/AbyssForum/__pycache__/crawler_selenium.cpython-310.pyc b/Forums/AbyssForum/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index 5512e92..0000000 Binary files a/Forums/AbyssForum/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/Forums/AbyssForum/__pycache__/crawler_selenium.cpython-311.pyc b/Forums/AbyssForum/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index d08a6e5..0000000 Binary files a/Forums/AbyssForum/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/Forums/AbyssForum/__pycache__/parser.cpython-310.pyc b/Forums/AbyssForum/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 7115900..0000000 Binary files a/Forums/AbyssForum/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/Forums/AbyssForum/__pycache__/parser.cpython-311.pyc b/Forums/AbyssForum/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index 55a8281..0000000 Binary files a/Forums/AbyssForum/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/Forums/Altenens/__pycache__/crawler_selenium.cpython-310.pyc b/Forums/Altenens/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index 8cdd037..0000000 Binary files a/Forums/Altenens/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/Forums/Altenens/__pycache__/crawler_selenium.cpython-311.pyc b/Forums/Altenens/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index 38b5dbc..0000000 Binary files a/Forums/Altenens/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/Forums/Altenens/__pycache__/parser.cpython-310.pyc b/Forums/Altenens/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index b95b89c..0000000 Binary files a/Forums/Altenens/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/Forums/Altenens/__pycache__/parser.cpython-311.pyc b/Forums/Altenens/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index 3f1c5d5..0000000 Binary files a/Forums/Altenens/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/Forums/CryptBB/__pycache__/__init__.cpython-311.pyc b/Forums/CryptBB/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index c09fee6..0000000 Binary files a/Forums/CryptBB/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/Forums/CryptBB/__pycache__/crawler_selenium.cpython-310.pyc b/Forums/CryptBB/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index b64d5e7..0000000 Binary files a/Forums/CryptBB/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/Forums/CryptBB/__pycache__/crawler_selenium.cpython-311.pyc b/Forums/CryptBB/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index 3f5473f..0000000 Binary files a/Forums/CryptBB/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/Forums/CryptBB/__pycache__/parser.cpython-310.pyc b/Forums/CryptBB/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 9086c35..0000000 Binary files a/Forums/CryptBB/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/Forums/CryptBB/__pycache__/parser.cpython-311.pyc b/Forums/CryptBB/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index dc04000..0000000 Binary files a/Forums/CryptBB/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-310.pyc b/Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index e28f1a2..0000000 Binary files a/Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-311.pyc b/Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index 80d392b..0000000 Binary files a/Forums/HiddenAnswers/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/Forums/HiddenAnswers/__pycache__/parser.cpython-310.pyc b/Forums/HiddenAnswers/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 1444998..0000000 Binary files a/Forums/HiddenAnswers/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/Forums/HiddenAnswers/__pycache__/parser.cpython-311.pyc b/Forums/HiddenAnswers/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index d9c9fb5..0000000 Binary files a/Forums/HiddenAnswers/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/Forums/Initialization/__pycache__/__init__.cpython-310.pyc b/Forums/Initialization/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 481f9a7..0000000 Binary files a/Forums/Initialization/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/Forums/Initialization/__pycache__/__init__.cpython-311.pyc b/Forums/Initialization/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index c6c10a4..0000000 Binary files a/Forums/Initialization/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/Forums/Initialization/__pycache__/forums_mining.cpython-310.pyc b/Forums/Initialization/__pycache__/forums_mining.cpython-310.pyc deleted file mode 100644 index f8f47ca..0000000 Binary files a/Forums/Initialization/__pycache__/forums_mining.cpython-310.pyc and /dev/null differ diff --git a/Forums/Initialization/__pycache__/forums_mining.cpython-311.pyc b/Forums/Initialization/__pycache__/forums_mining.cpython-311.pyc deleted file mode 100644 index 294aebc..0000000 Binary files a/Forums/Initialization/__pycache__/forums_mining.cpython-311.pyc and /dev/null differ diff --git a/Forums/Initialization/__pycache__/prepare_parser.cpython-310.pyc b/Forums/Initialization/__pycache__/prepare_parser.cpython-310.pyc deleted file mode 100644 index 158eea1..0000000 Binary files a/Forums/Initialization/__pycache__/prepare_parser.cpython-310.pyc and /dev/null differ diff --git a/Forums/Initialization/__pycache__/prepare_parser.cpython-311.pyc b/Forums/Initialization/__pycache__/prepare_parser.cpython-311.pyc deleted file mode 100644 index 121809c..0000000 Binary files a/Forums/Initialization/__pycache__/prepare_parser.cpython-311.pyc and /dev/null differ diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py index 5fcf17e..6c76692 100644 --- a/Forums/Initialization/forums_mining.py +++ b/Forums/Initialization/forums_mining.py @@ -99,9 +99,9 @@ if __name__ == '__main__': forum = forum.replace('\n','') print("Creating listing and description directories ... for " + forum) - createDirectory(forum) - time.sleep(5) # wait for directories to be created - input("Directories created successfully. Press ENTER to continue\n") + # createDirectory(forum) + # time.sleep(5) # wait for directories to be created + # input("Directories created successfully. Press ENTER to continue\n") if forum == "BestCardingWorld": diff --git a/Forums/Initialization/geckodriver.log b/Forums/Initialization/geckodriver.log index bfb039a..8d4ccb3 100644 --- a/Forums/Initialization/geckodriver.log +++ b/Forums/Initialization/geckodriver.log @@ -11198,3 +11198,80 @@ unwatchForTargets()@TargetList.jsm:37 destructor()@TargetList.jsm:109 stop()@CDP.jsm:104 close()@RemoteAgent.jsm:138 +1689622469580 geckodriver INFO Listening on 127.0.0.1:58866 +1689622474728 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "58867" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofile5gOLDP" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689622475417 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:58867/devtools/browser/9a3a8de2-439e-425e-b415-f975abd86b65 +1689622476941 Marionette INFO Listening on port 58873 +1689622477054 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofile5gOLDP\thumbnails) because it does not exist +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: undefined, line 0: Error: Missing host permission for the tab +JavaScript error: undefined, line 0: Error: Missing host permission for the tab +1689624030995 Marionette INFO Stopped listening on port 58873 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofile5gOLDP\thumbnails) because it does not exist + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1689624031467 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689624276336 geckodriver INFO Listening on 127.0.0.1:59792 +1689624280979 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "59793" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofileSTe5EC" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689624281509 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:59793/devtools/browser/222a61fa-a958-4978-8048-bb632f658131 +1689624283001 Marionette INFO Listening on port 59799 +1689624283405 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689624692072 Marionette INFO Stopped listening on port 59799 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofileSTe5EC\thumbnails) because it does not exist + +###!!! [Parent][RunMessage] Error: Channel closing: too late to send/recv, messages will be lost + +1689624692916 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index fe8be28..4c6a407 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -9,6 +9,7 @@ from Forums.BestCardingWorld.parser import * from Forums.CryptBB.parser import * from Forums.OnniForums.parser import * from Forums.Altenens.parser import * +from Forums.Procrax.parser import * from Forums.Classifier.classify_product import predict # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi @@ -154,6 +155,8 @@ def new_parse(forum, url, createLog): rmm = onniForums_description_parser(soup) elif forum == "Altenens": rmm = altenens_description_parser(soup) + elif forum == "Procrax": + rmm = procrax_description_parser(soup) # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip() key = u"Url:" + os.path.basename(line2).replace(".html", "") @@ -233,6 +236,8 @@ def new_parse(forum, url, createLog): rw = onniForums_listing_parser(soup) elif forum == "Altenens": rw = altenens_listing_parser(soup) + elif forum == "Procrax": + rw = procrax_listing_parser(soup) except: diff --git a/Forums/OnniForums/__pycache__/crawler_selenium.cpython-310.pyc b/Forums/OnniForums/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index a7ffacc..0000000 Binary files a/Forums/OnniForums/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/Forums/OnniForums/__pycache__/crawler_selenium.cpython-311.pyc b/Forums/OnniForums/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index 8278bd9..0000000 Binary files a/Forums/OnniForums/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/Forums/OnniForums/__pycache__/parser.cpython-310.pyc b/Forums/OnniForums/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index f08bc35..0000000 Binary files a/Forums/OnniForums/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/Forums/OnniForums/__pycache__/parser.cpython-311.pyc b/Forums/OnniForums/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index cdefc99..0000000 Binary files a/Forums/OnniForums/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/Forums/OnniForums/__pycache__/parser_script.cpython-311.pyc b/Forums/OnniForums/__pycache__/parser_script.cpython-311.pyc deleted file mode 100644 index d03ffa2..0000000 Binary files a/Forums/OnniForums/__pycache__/parser_script.cpython-311.pyc and /dev/null differ diff --git a/Forums/Procrax/crawler_selenium.py b/Forums/Procrax/crawler_selenium.py index 99985b5..9d37eae 100644 --- a/Forums/Procrax/crawler_selenium.py +++ b/Forums/Procrax/crawler_selenium.py @@ -26,24 +26,28 @@ from Forums.Procrax.parser import procrax_links_parser from Forums.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'https://procrax.cx/' +BASE_URL = 'https://procrax.cx/' +FORUM_NAME = 'Procrax' # Opens Tor Browser, crawls the website def startCrawling(): - opentor() - # forumName = getForumName() - driver = getAccess() + # opentor() + # driver = getAccess() - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closetor(driver) + # if driver != 'down': + # try: + # login(driver) + # crawlForum(driver) + # except Exception as e: + # print(driver.current_url, e) + # closetor(driver) - # new_parse(forumName, False) + new_parse( + forum=FORUM_NAME, + url=BASE_URL, + createLog=False + ) # Opens Tor Browser @@ -139,10 +143,9 @@ def createFFDriver(): return driver def getAccess(): - url = getFixedURL() driver = createFFDriver() try: - driver.get(url)# open url in browser + driver.get(BASE_URL)# open url in browser return driver except: driver.close()# close tab @@ -162,7 +165,7 @@ def savePage(page, url): def getFullPathName(url): from Forums.Initialization.forums_mining import config, CURRENT_DATE - mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") + mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + FORUM_NAME + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') @@ -185,17 +188,17 @@ def getInterestedLinks(): links = [] # # general hacking - # links.append('https://procrax.cx/forums/general-hacking.24/') + links.append('https://procrax.cx/forums/general-hacking.24/') # # hacking security tools - # links.append('https://procrax.cx/forums/hacking-security-tools.20/') + links.append('https://procrax.cx/forums/hacking-security-tools.20/') # # hacktube - # links.append('https://procrax.cx/forums/hacktube.22/') + links.append('https://procrax.cx/forums/hacktube.22/') # # cardable # links.append('https://procrax.cx/forums/cardable-websites.28/') # # tools # links.append('https://procrax.cx/forums/tools-bots-validators.73/') # general forum - links.append('https://procrax.cx/forums/forum-discussions-updates.7/') + # links.append('https://procrax.cx/forums/forum-discussions-updates.7/') return links @@ -229,7 +232,7 @@ def crawlForum(driver): page = topic while has_next_topic_page: - itemURL = urlparse.urljoin(baseURL, str(page)) + itemURL = urlparse.urljoin(BASE_URL, str(page)) try: driver.get(itemURL) except: @@ -237,8 +240,8 @@ def crawlForum(driver): savePage(driver.page_source, topic + f"page{counter}") # very important # comment out - if counter == 2: - break + # if counter == 2: + # break try: page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') @@ -254,10 +257,10 @@ def crawlForum(driver): driver.back() # comment out - break + # break # comment out - if count == 1: + if count == 20: break try: diff --git a/Forums/Procrax/parser.py b/Forums/Procrax/parser.py index 30cc2e8..7c9c463 100644 --- a/Forums/Procrax/parser.py +++ b/Forums/Procrax/parser.py @@ -7,11 +7,12 @@ from datetime import timedelta import re # Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, ResultSet, Tag # This is the method to parse the Description Pages (one page to each topic in the Listing Pages) -def cryptBB_description_parser(soup): + +def procrax_description_parser(soup: Tag): # Fields to be parsed @@ -27,146 +28,36 @@ def cryptBB_description_parser(soup): # Finding the topic (should be just one coming from the Listing Page) - li = soup.find("td", {"class": "thead"}).find('strong') + li = soup.find("h1", {"class": "p-title-value"}) topic = li.text - topic = re.sub("\[\w*\]", '', topic) - - topic = topic.replace(",","") - topic = topic.replace("\n","") - topic = cleanString(topic.strip()) - - # Finding the repeated tag that corresponds to the listing of posts - - # try: - posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all( - 'div', {"class": "post"}) - - # For each message (post), get all the fields we are interested to: - - for ipost in posts: - - # Finding a first level of the HTML page - - post_wrapper = ipost.find('span', {"class": "largetext"}) - - # Finding the author (user) of the post - - author = post_wrapper.text.strip() - user.append(cleanString(author)) # Remember to clean the problematic characters - - # Finding the status of the author - - smalltext = ipost.find('div', {"class": "post_author"}) - - ''' - # Testing here two possibilities to find this status and combine them - if ipost.find('div', {"class": "deleted_post_author"}): - status.append(-1) - interest.append(-1) - reputation.append(-1) - addDate.append(-1) - post.append("THIS POST HAS BEEN REMOVED!") - sign.append(-1) - feedback.append(-1) - continue - ''' - - # CryptBB does have membergroup and postgroup - - membergroup = smalltext.find('div', {"class": "profile-rank"}) - postgroup = smalltext.find('div', {"class": "postgroup"}) - if membergroup != None: - membergroup = membergroup.text.strip() - if postgroup != None: - postgroup = postgroup.text.strip() - membergroup = membergroup + " - " + postgroup - else: - if postgroup != None: - membergroup = postgroup.text.strip() - else: - membergroup = "-1" - status.append(cleanString(membergroup)) - - # Finding the interest of the author - # CryptBB does not have blurb - blurb = smalltext.find('li', {"class": "blurb"}) - if blurb != None: - blurb = blurb.text.strip() - else: - blurb = "-1" - interest.append(cleanString(blurb)) - - # Finding the reputation of the user - # CryptBB does have reputation - author_stats = smalltext.find('div', {"class": "author_statistics"}) - karma = author_stats.find('strong') - if karma != None: - karma = karma.text - karma = karma.replace("Community Rating: ", "") - karma = karma.replace("Karma: ", "") - karma = karma.strip() - else: - karma = "-1" - reputation.append(cleanString(karma)) - - # Getting here another good tag to find the post date, post content and users' signature - - postarea = ipost.find('div', {"class": "post_content"}) - - dt = postarea.find('span', {"class": "post_date"}).text - # dt = dt.strip().split() - dt = dt.strip() - day=date.today() - if "Yesterday" in dt: - yesterday = day - timedelta(days=1) - yesterday = yesterday.strftime('%m-%d-%Y') - stime = dt.replace('Yesterday,','').strip() - date_time_obj = yesterday+ ', '+stime - date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p') - elif "hours ago" in dt: - day = day.strftime('%m-%d-%Y') - date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title'] - date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p') - else: - date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p') - stime = date_time_obj.strftime('%b %d, %Y') - sdate = date_time_obj.strftime('%I:%M %p') - addDate.append(date_time_obj) - - # Finding the post - - inner = postarea.find('div', {"class": "post_body scaleimages"}) - inner = inner.text.strip() - post.append(cleanString(inner)) - - # Finding the user's signature - - # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) - signature = ipost.find('div', {"class": "signature scaleimages"}) - if signature != None: - signature = signature.text.strip() - # print(signature) - else: - signature = "-1" - sign.append(cleanString(signature)) - - # As no information about user's feedback was found, just assign "-1" to the variable - + + thread: ResultSet[Tag] = soup.find("div", {"class": "block-body js-replyNewMessageContainer"}).find_all("article", {"data-author": True}) + + for ipost in thread: + username = ipost.find("h4", {"class": "message-name"}).text + user.append(cleanString(username.strip())) + + date_posted = ipost.find("ul", {"class": "message-attribution-main listInline"}).find("time").get("datetime") + datetime_obj = datetime.strptime(date_posted, "%Y-%m-%dT%H:%M:%S%z") + addDate.append(datetime_obj) + + feedback.append("-1") - - ''' - except: - if soup.find('td', {"class": "trow1"}).text == " You do not have permission to access this page. ": - user.append("-1") - status.append(-1) - interest.append(-1) - reputation.append(-1) - addDate.append(-1) - post.append("NO ACCESS TO THIS PAGE!") - sign.append(-1) - feedback.append(-1) - ''' - + + user_status = ipost.find("h5", {"class": "userTitle message-userTitle"}).text + status.append(cleanString(user_status.strip())) + + user_lvl = ipost.find("div", {"class": "afAwardLevel"}).text + reputation.append(cleanString(user_lvl.strip())) + + sign.append("-1") + + user_post = ipost.find("article", {"class": "message-body js-selectToQuote"}).text + post.append(cleanString(user_post.strip())) + + interest.append("-1") + + # Populate the final variable (this should be a list with all fields scraped) @@ -178,7 +69,7 @@ def cryptBB_description_parser(soup): # This is the method to parse the Listing Pages (one page with many posts) -def cryptBB_listing_parser(soup): +def procrax_listing_parser(soup: Tag): board = "-1" # board name (the previous level of the topic in the Forum categorization tree. # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) @@ -193,59 +84,47 @@ def cryptBB_listing_parser(soup): # Listing and Description pages) # Finding the board (should be just one) - - board = soup.find('span', {"class": "active"}).text - board = cleanString(board.strip()) - - # Finding the repeated tag that corresponds to the listing of topics - - itopics = soup.find_all('tr',{"class": "inline_row"}) - - for itopic in itopics: - - # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them - # to don't miss any topic - - # Adding the topic to the topic list - try: - topics = itopic.find('span', {"class": "subject_old"}).find('a').text - except: - topics = itopic.find('span', {"class": "subject_new"}).find('a').text - topics = re.sub("\[\w*\]", '', topics) - topic.append(cleanString(topics)) - - # Counting how many topics we have found so far - - nm = len(topic) - - # Adding the url to the list of urls - try: - link = itopic.find('span', {"class": "subject_old"}).find('a').get('href') - except: - link = itopic.find('span',{"class": "subject_new"}).find('a').get('href') - link = cleanLink(link) - href.append(link) - - # Finding the author of the topic - ps = itopic.find('div', {"class":"author smalltext"}).find('a').text - user = ps.strip() - author.append(cleanString(user)) - - # Finding the number of replies - columns = itopic.findChildren('td',recursive=False) - replies = columns[3].text - - posts.append(cleanString(replies)) - - # Finding the number of Views - tview = columns[4].text - views.append(cleanString(tview)) - - # If no information about when the topic was added, just assign "-1" to the variable - - addDate.append("-1") - - return organizeTopics("CryptBB", nm, topic, board, author, views, posts, href, addDate) + li = soup.find("h1", {"class": "p-title-value"}) + board = cleanString(li.text.strip()) + + threads_list: ResultSet[Tag] = soup.find("div", {"class": "structItemContainer-group js-threadList"}).find_all("div", {"data-author": True}) + + nm = len(threads_list) + + for thread in threads_list: + thread_title = thread.find("div", {"class": "structItem-title"}).text + topic.append(cleanString(thread_title.strip())) + + thread_author = thread.get("data-author") + author.append(cleanString(thread_author)) + + thread_views = thread.find("dl", {"class": "pairs pairs--justified structItem-minor"}).find('dd').text + views.append(cleanString(thread_views.strip())) + + thread_replies = thread.find("dl", {"class": "pairs pairs--justified"}).find('dd').text + # All threads contain one topic post and reply posts + thread_total_posts = str(1 + int(thread_replies)) + posts.append(thread_total_posts) + + thread_date = thread.find("li", {"class": "structItem-startDate"}).find("time").get("datetime") + datetime_obj = datetime.strptime(thread_date, "%Y-%m-%dT%H:%M:%S%z") + addDate.append(datetime_obj) + + thread_link = thread.find("div", {"class": "structItem-title"}).find('a').get('href') + href.append(thread_link) + + + return organizeTopics( + forum="Procrax", + nm=nm, + board=board, + author=author, + topic=topic, + views=views, + posts=posts, + addDate=addDate, + href=href + ) def procrax_links_parser(soup): diff --git a/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-310.pyc b/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index dc895d5..0000000 Binary files a/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index f88c80b..0000000 Binary files a/MarketPlaces/AnonymousMarketplace/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-310.pyc b/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 1dde171..0000000 Binary files a/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc b/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index 680753a..0000000 Binary files a/MarketPlaces/AnonymousMarketplace/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc b/MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc index 95b0bbf..bb4ff6e 100644 Binary files a/MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc and b/MarketPlaces/DB_Connection/__pycache__/db_connection.cpython-311.pyc differ diff --git a/MarketPlaces/Initialization/__pycache__/__init__.cpython-310.pyc b/MarketPlaces/Initialization/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 495b447..0000000 Binary files a/MarketPlaces/Initialization/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/Initialization/__pycache__/__init__.cpython-311.pyc b/MarketPlaces/Initialization/__pycache__/__init__.cpython-311.pyc deleted file mode 100644 index 1309640..0000000 Binary files a/MarketPlaces/Initialization/__pycache__/__init__.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/Initialization/__pycache__/markets_mining.cpython-310.pyc b/MarketPlaces/Initialization/__pycache__/markets_mining.cpython-310.pyc deleted file mode 100644 index 5189e47..0000000 Binary files a/MarketPlaces/Initialization/__pycache__/markets_mining.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/Initialization/__pycache__/markets_mining.cpython-311.pyc b/MarketPlaces/Initialization/__pycache__/markets_mining.cpython-311.pyc deleted file mode 100644 index d2bbff8..0000000 Binary files a/MarketPlaces/Initialization/__pycache__/markets_mining.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-310.pyc b/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-310.pyc deleted file mode 100644 index 2da4f71..0000000 Binary files a/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc b/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc deleted file mode 100644 index 0e40335..0000000 Binary files a/MarketPlaces/Initialization/__pycache__/prepare_parser.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/Initialization/geckodriver.log b/MarketPlaces/Initialization/geckodriver.log index 7998782..1257729 100644 --- a/MarketPlaces/Initialization/geckodriver.log +++ b/MarketPlaces/Initialization/geckodriver.log @@ -15617,3 +15617,73 @@ unwatchForTargets()@TargetList.jsm:37 destructor()@TargetList.jsm:109 stop()@CDP.jsm:104 close()@RemoteAgent.jsm:138 +1689619116242 geckodriver INFO Listening on 127.0.0.1:57366 +1689619118954 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "57367" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofile0Dg5aD" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689619119382 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:57367/devtools/browser/26c42825-1d86-4c6a-ad3b-817e084e0b36 +1689619120284 Marionette INFO Listening on port 57373 +1689619120428 RemoteAgent WARN TLS certificate errors will be ignored for this session +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofile0Dg5aD\thumbnails) because it does not exist +1689619308722 Marionette INFO Stopped listening on port 57373 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +JavaScript error: chrome://remote/content/marionette/cert.js, line 55: NS_ERROR_NOT_AVAILABLE: Component returned failure code: 0x80040111 (NS_ERROR_NOT_AVAILABLE) [nsICertOverrideService.setDisableAllSecurityChecksAndLetAttackersInterceptMyData] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofile0Dg5aD\thumbnails) because it does not exist +Crash Annotation GraphicsCriticalError: |[C0][GFX1-]: Receive IPC close with reason=AbnormalShutdown (t=1960.99) +###!!! [Child][MessageChannel] Error: (msgtype=0x3900E5,name=PContent::Msg_GraphicsError) Channel closing: too late to send/recv, messages will be lost + +[GFX1-]: Receive IPC close with reason=AbnormalShutdown +1689619309292 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 +1689619356214 geckodriver INFO Listening on 127.0.0.1:57526 +1689619360407 mozrunner::runner INFO Running command: "C:\\Users\\minhkhoitran\\Desktop\\Tor Browser\\Browser\\firefox.exe" "--marionette" "--remote-debugging-port" "57527" "--remote-allow-hosts" "localhost" "-no-remote" "-profile" "C:\\Users\\MINHKH~1\\AppData\\Local\\Temp\\rust_mozprofileUEfwdk" +console.log: "TorSettings: loadFromPrefs()" +console.log: "TorConnect: init()" +console.log: "TorConnect: Entering Initial state" +console.log: "TorConnect: Observed profile-after-change" +console.log: "TorConnect: Observing topic 'TorProcessExited'" +console.log: "TorConnect: Observing topic 'TorLogHasWarnOrErr'" +console.log: "TorConnect: Observing topic 'torsettings:ready'" +console.log: "TorSettings: Observed profile-after-change" +1689619360903 Marionette INFO Marionette enabled +console.log: "TorConnect: Will load after bootstrap => [about:blank]" +console.error: "Could not load engine blockchair-onion@search.mozilla.org: Error: Extension is invalid" +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XULStore.jsm, line 66: Error: Can't find profile directory. +JavaScript error: resource://gre/modules/XPCOMUtils.jsm, line 161: TypeError: Cc[aContract] is undefined +DevTools listening on ws://localhost:57527/devtools/browser/85530b1c-e7e2-4313-8c36-704d0f5ce7da +1689619362005 Marionette INFO Listening on port 57534 +1689619362321 RemoteAgent WARN TLS certificate errors will be ignored for this session +1689619608554 Marionette INFO Stopped listening on port 57534 +JavaScript error: resource:///modules/Interactions.jsm, line 209: NS_ERROR_FAILURE: Component returned failure code: 0x80004005 (NS_ERROR_FAILURE) [nsIUserIdleService.removeIdleObserver] +!!! error running onStopped callback: TypeError: callback is not a function +JavaScript error: resource:///modules/sessionstore/SessionFile.jsm, line 375: Error: _initWorker called too early! Please read the session file from disk first. +JavaScript error: resource://gre/modules/PromiseWorker.jsm, line 106: Error: Could not get children of file(C:\Users\minhkhoitran\AppData\Local\Temp\rust_mozprofileUEfwdk\thumbnails) because it does not exist +1689619609120 RemoteAgent ERROR unable to stop listener: [Exception... "Component returned failure code: 0x8000ffff (NS_ERROR_UNEXPECTED) [nsIWindowMediator.getEnumerator]" nsresult: "0x8000ffff (NS_ERROR_UNEXPECTED)" location: "JS frame :: chrome://remote/content/cdp/observers/TargetObserver.jsm :: stop :: line 64" data: no] Stack trace: stop()@TargetObserver.jsm:64 +unwatchForTabs()@TargetList.jsm:70 +unwatchForTargets()@TargetList.jsm:37 +destructor()@TargetList.jsm:109 +stop()@CDP.jsm:104 +close()@RemoteAgent.jsm:138 diff --git a/MarketPlaces/Initialization/marketsList.txt b/MarketPlaces/Initialization/marketsList.txt index 2cf7709..032ecf3 100644 --- a/MarketPlaces/Initialization/marketsList.txt +++ b/MarketPlaces/Initialization/marketsList.txt @@ -1 +1 @@ -AnonymousMarketplace \ No newline at end of file +M00nkeyMarket \ No newline at end of file diff --git a/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-310.pyc b/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index e8ecfba..0000000 Binary files a/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index a940cdc..0000000 Binary files a/MarketPlaces/M00nkeyMarket/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-310.pyc b/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 7ede7a6..0000000 Binary files a/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc b/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index af3e012..0000000 Binary files a/MarketPlaces/M00nkeyMarket/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/M00nkeyMarket/crawler_selenium.py b/MarketPlaces/M00nkeyMarket/crawler_selenium.py index 2f651a5..2161244 100644 --- a/MarketPlaces/M00nkeyMarket/crawler_selenium.py +++ b/MarketPlaces/M00nkeyMarket/crawler_selenium.py @@ -27,25 +27,24 @@ from MarketPlaces.M00nkeyMarket.parser import m00nkey_links_parser from MarketPlaces.Utilities.utilities import cleanHTML counter = 1 -baseURL = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/' - +BASE_URL = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/' +MARKET_NAME = 'M00nkeyMarket' # Opens Tor Browser, crawls the website, then parses, then closes tor #acts like the main method for the crawler, another function at the end of this code calls this function later def startCrawling(): - # opentor() - mktName = getMKTName() - # driver = getAccess() - # - # if driver != 'down': - # try: - # login(driver) - # crawlForum(driver) - # except Exception as e: - # print(driver.current_url, e) - # closetor(driver) + opentor() + driver = getAccess() + + if driver != 'down': + try: + login(driver) + crawlForum(driver) + except Exception as e: + print(driver.current_url, e) + closetor(driver) - new_parse(mktName, baseURL, False) + new_parse(MARKET_NAME, BASE_URL, False) # Opens Tor Browser @@ -64,16 +63,16 @@ def opentor(): # Returns the name of the website #return: name of site in string type -def getMKTName(): - name = 'M00nkeyMarket' - return name +# def getMKTName(): +# name = 'M00nkeyMarket' +# return name # Return the base link of the website #return: url of base site in string type -def getFixedURL(): - url = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/' - return url +# def getFixedURL(): +# url = 'http://moonkey4f2mkcp6hpackeea356puiry27h3dz3hzbt3adbmsk4gs7wyd.onion/' +# return url # Closes Tor Browser @@ -127,10 +126,9 @@ def createFFDriver(): #the driver 'gets' the url, attempting to get on the site, if it can't access return 'down' #return: return the selenium driver or string 'down' def getAccess(): - url = getFixedURL() driver = createFFDriver() try: - driver.get(url) + driver.get(BASE_URL) return driver except: driver.close() @@ -175,7 +173,7 @@ def savePage(page, url): def getFullPathName(url): from MarketPlaces.Initialization.markets_mining import config, CURRENT_DATE - mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + getMKTName() + "/HTML_Pages") + mainDir = os.path.join(config.get('Project', 'shared_folder'), "MarketPlaces/" + MARKET_NAME + "/HTML_Pages") fileName = getNameFromURL(url) if isDescriptionLink(url): fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') @@ -237,7 +235,7 @@ def crawlForum(driver): while has_next_page: list = productPages(html) for item in list: - itemURL = urlparse.urljoin(baseURL, str(item)) + itemURL = urlparse.urljoin(BASE_URL, str(item)) try: driver.get(itemURL) except: diff --git a/MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-310.pyc b/MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index 4bb3771..0000000 Binary files a/MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index 46ed74e..0000000 Binary files a/MarketPlaces/Tor2door/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/Tor2door/__pycache__/parser.cpython-310.pyc b/MarketPlaces/Tor2door/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 5b284b2..0000000 Binary files a/MarketPlaces/Tor2door/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/Tor2door/__pycache__/parser.cpython-311.pyc b/MarketPlaces/Tor2door/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index a6b28ec..0000000 Binary files a/MarketPlaces/Tor2door/__pycache__/parser.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-310.pyc b/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-310.pyc deleted file mode 100644 index d72e9e9..0000000 Binary files a/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc b/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc deleted file mode 100644 index ea57780..0000000 Binary files a/MarketPlaces/TorBay/__pycache__/crawler_selenium.cpython-311.pyc and /dev/null differ diff --git a/MarketPlaces/TorBay/__pycache__/parser.cpython-310.pyc b/MarketPlaces/TorBay/__pycache__/parser.cpython-310.pyc deleted file mode 100644 index 2ff9034..0000000 Binary files a/MarketPlaces/TorBay/__pycache__/parser.cpython-310.pyc and /dev/null differ diff --git a/MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc b/MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc deleted file mode 100644 index b8c67cf..0000000 Binary files a/MarketPlaces/TorBay/__pycache__/parser.cpython-311.pyc and /dev/null differ