__author__ = 'Helium'

'''
Procrax Forum Crawler (Selenium)
'''

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from PIL import Image

import urllib.parse as urlparse
import os, re, time
from datetime import date
import configparser
import subprocess
from bs4 import BeautifulSoup

from Forums.Initialization.prepare_parser import new_parse
from Forums.Procrax.parser import procrax_links_parser
from Forums.Utilities.utilities import cleanHTML

counter = 1
baseURL = 'https://procrax.cx/'


# Opens Tor Browser, crawls the website
def startCrawling():
    opentor()
    # forumName = getForumName()
    driver = getAccess()

    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            print(driver.current_url, e)
        closetor(driver)

    # new_parse(forumName, False)


# Opens Tor Browser
def opentor():
    from Forums.Initialization.forums_mining import config

    global pid
    print("Connecting Tor...")
    pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
    pid = pro.pid
    time.sleep(7.5)
    input('Tor Connected. Press ENTER to continue\n')
    return


# Login using premade account credentials and do login captcha manually
def login(driver):
    # wait up to 50 sec for the login button to become visible before touching the page
    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
        (By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span')))

    # # entering username and password into input boxes
    # usernameBox = driver.find_element(by=By.NAME, value='login')
    # # Username here
    # usernameBox.send_keys('cheese_pizza_man')  # sends string to the username box
    # passwordBox = driver.find_element(by=By.NAME, value='password')
    # # Password here
    # passwordBox.send_keys('Gr33nSp@m&3ggs')  # sends string to passwordBox
    #
    # clicker = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div[2]/div[1]/form/div/div/div/dl[4]/dd/div/div[2]/button/span')
    # clicker.click()
    #
    # # wait for the listing page to show up (this XPath may need to change based on the seed url)
    # # wait for 50 sec until id = tab_content is found, then continue
    # WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
    #     (By.XPATH, '/html/body/div[1]/div[3]/div[2]/div[3]/div[1]/div/div[1]/div')))


# Returns the name of the website
def getForumName():
    name = 'Procrax'
    return name


# Returns the link of the website
def getFixedURL():
    url = 'https://procrax.cx/'
    return url


# Closes Tor Browser
def closetor(driver):
    # global pid
    # os.system("taskkill /pid " + str(pro.pid))
    # os.system("taskkill /t /f /im tor.exe")
    print('Closing Tor...')
    driver.close()  # close tab
    time.sleep(3)
    return


# Creates FireFox 'driver' and configures its 'Profile'
# to use Tor proxy and socket
def createFFDriver():
    from Forums.Initialization.forums_mining import config

    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))

    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
    ff_prof.set_preference("places.history.enabled", False)
    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
    ff_prof.set_preference("signon.rememberSignons", False)
    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
    ff_prof.set_preference("network.dns.disablePrefetch", True)
    ff_prof.set_preference("network.http.sendRefererHeader", 0)
    ff_prof.set_preference("permissions.default.image", 3)
    ff_prof.set_preference("browser.download.folderList", 2)
    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
    ff_prof.set_preference('network.proxy.type', 1)
    ff_prof.set_preference("network.proxy.socks_version", 5)
    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
    ff_prof.set_preference('network.proxy.socks_port', 9150)
    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
    ff_prof.set_preference("javascript.enabled", True)
    ff_prof.update_preferences()

    service = Service(config.get('TOR', 'geckodriver_path'))

    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

    return driver


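# A minimal sketch of the config entries this module reads via configparser.
# The section/key names mirror the config.get() calls above; the path and
# folder values below are hypothetical placeholders, not the project's real
# settings.
#
#   [TOR]
#   firefox_binary_path = C:\Tor Browser\Browser\firefox.exe
#   firefox_profile_path = C:\Tor Browser\Browser\TorBrowser\Data\Browser\profile.default
#   geckodriver_path = C:\WebDrivers\geckodriver.exe
#
#   [Project]
#   shared_folder = \\VBoxSvr\shared

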
def getAccess():
    url = getFixedURL()
    driver = createFFDriver()
    try:
        driver.get(url)  # open url in browser
        return driver
    except:
        driver.close()  # close tab
        return 'down'


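# Typical usage (this mirrors startCrawling above): getAccess() returns either
# a live WebDriver or the sentinel string 'down', so callers must check for it:
#   driver = getAccess()
#   if driver != 'down':
#       crawlForum(driver)

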
# Saves the crawled html page
def savePage(page, url):
    cleanPage = cleanHTML(page)
    filePath = getFullPathName(url)
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    with open(filePath, 'wb') as f:  # 'with' closes the handle instead of leaking it
        f.write(cleanPage.encode('utf-8'))
    return


# Gets the full path of the page to be saved along with its appropriate file name
def getFullPathName(url):
    from Forums.Initialization.forums_mining import config, CURRENT_DATE

    mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages")
    fileName = getNameFromURL(url)
    # build the path with os.path.join components instead of raw '\\' literals,
    # so it works on any OS and avoids doubled backslashes in the file name
    if isDescriptionLink(url):
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Description', fileName + '.html')
    else:
        fullPath = os.path.join(mainDir, CURRENT_DATE, 'Listing', fileName + '.html')
    return fullPath


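# For illustration (hypothetical values): with shared_folder '/data/shared' and
# CURRENT_DATE '2023-06-01', a thread URL is saved under
# '/data/shared/Forums/Procrax/HTML_Pages/2023-06-01/Description/<name>.html'
# and a forum listing under '.../2023-06-01/Listing/<name>.html'.

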
# Creates the file name from passed URL
def getNameFromURL(url):
    global counter
    name = ''.join(e for e in url if e.isalnum())
    if name == '':
        name = str(counter)
        counter = counter + 1
    return name


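# For example, getNameFromURL('https://procrax.cx/forums/general-hacking.24/')
# keeps only the alphanumeric characters and returns
# 'httpsprocraxcxforumsgeneralhacking24'; the global counter is used only when
# a URL contains no alphanumeric characters at all.

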
def getInterestedLinks():
    links = []

    # # general hacking
    # links.append('https://procrax.cx/forums/general-hacking.24/')
    # # hacking security tools
    # links.append('https://procrax.cx/forums/hacking-security-tools.20/')
    # # hacktube
    # links.append('https://procrax.cx/forums/hacktube.22/')
    # # cardable
    # links.append('https://procrax.cx/forums/cardable-websites.28/')
    # # tools
    # links.append('https://procrax.cx/forums/tools-bots-validators.73/')
    # general forum
    links.append('https://procrax.cx/forums/forum-discussions-updates.7/')

    return links


def crawlForum(driver):
    print("Crawling the Procrax forum")

    linksToCrawl = getInterestedLinks()
    visited = set(linksToCrawl)
    initialTime = time.time()

    i = 0
    count = 0
    while i < len(linksToCrawl):
        link = linksToCrawl[i]
        print('Crawling :', link)
        try:
            try:
                driver.get(link)  # open
            except:
                driver.refresh()
            html = driver.page_source
            savePage(html, link)

            has_next_page = True

            # loop through the topics
            while has_next_page:
                topics = topicPages(html)  # for multiple pages; renamed from 'list' to avoid shadowing the builtin
                for item in topics:
                    # variable to check if there is a next page for the topic
                    has_next_topic_page = True
                    pageCount = 1  # renamed from 'counter' to avoid shadowing the module-level counter

                    # check if there is a next page for the topic
                    while has_next_topic_page:
                        # try to access the next page of the topic
                        itemURL = urlparse.urljoin(baseURL, str(item))
                        try:
                            driver.get(itemURL)
                        except:
                            driver.refresh()
                        savePage(driver.page_source, item)

                        # if there is a next page then go and save....
                        # specific
                        try:
                            # temp = driver.find_element(By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div/div')
                            item = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')

                            if item == "":
                                raise NoSuchElementException
                            else:
                                pageCount += 1
                        except NoSuchElementException:
                            has_next_topic_page = False

                    # end of loop: back out of the topic's pages; use a separate
                    # loop variable so the outer link index 'i' is not clobbered
                    for j in range(pageCount):
                        driver.back()

                    # # comment out
                    # break
                    #
                    # # comment out
                    # if count == 1:
                    #     count = 0
                    #     break

                try:  # change depending on web page, #general
                    # /html/body/div[1]/div[3]/div[2]/div[3]/div/div/div/div[1]/div/nav/div[1]
                    # temp = driver.find_element(By.XPATH, value='/html/body/div[1]/div[3]/div[2]/div[3]/div/div/div/div[1]/div/nav/div[1]')
                    link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')

                    if link == "":
                        raise NoSuchElementException
                    try:
                        driver.get(link)
                    except:
                        driver.refresh()
                    html = driver.page_source
                    savePage(html, link)
                    count += 1

                except NoSuchElementException:
                    has_next_page = False

        except Exception as e:
            print(link, e)
        i += 1

    # finalTime = time.time()
    # print(finalTime - initialTime)

    input("Crawling Procrax forum done successfully. Press ENTER to continue\n")


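# Design note: after exhausting a topic's pages the crawler steps back with
# driver.back() (pageCount times) to return towards the listing, then follows
# the listing's own 'Next' link. An alternative sketch would be to remember
# the listing URL and driver.get() it directly, which avoids depending on
# browser history depth; the history-based approach is kept here as-is.

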
# Returns 'True' if the link is a Topic link, may need to change for every website
def isDescriptionLink(url):
    if 'threads' in url:
        return True
    return False


# Returns True if the link is a listingPage link, may need to change for every website
def isListingLink(url):
    if 'forums' in url:
        return True
    return False


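# For example, against this site's URL scheme:
#   isDescriptionLink('https://procrax.cx/threads/some-topic.123/')          -> True
#   isListingLink('https://procrax.cx/forums/forum-discussions-updates.7/')  -> True
# Note the substring checks are loose: a URL containing both 'forums' and
# 'threads' would satisfy both predicates.

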
# calling the parser to extract the topic links from a listing page
def topicPages(html):
    soup = BeautifulSoup(html, "html.parser")
    # print(soup.find('div', id="container").find('div', id="content").find('table', {"class": "tborder clear"}).find('tbody').find('tr', {"class": "inline_row"}).find('strong').text)
    return procrax_links_parser(soup)


def crawler():
    startCrawling()
    # print("Crawling and Parsing Procrax .... DONE!")