
cleaned terminal output

main · westernmeadow committed 1 year ago · commit 2022abecc8
14 changed files with 157 additions and 126 deletions
1. +13 -13  Forums/AbyssForum/crawler_selenium.py
2. +11 -11  Forums/Altenens/crawler_selenium.py
3. +3 -4  Forums/Altenens/parser.py
4. +1 -1  Forums/BestCardingWorld/crawler_selenium.py
5. +15 -15  Forums/Cardingleaks/crawler_selenium.py
6. +15 -17  Forums/CryptBB/crawler_selenium.py
7. +18 -18  Forums/HiddenAnswers/crawler_selenium.py
8. +9 -1  Forums/Initialization/forumsList.txt
9. +19 -6  Forums/Initialization/forums_mining.py
10. +25 -8  Forums/Initialization/prepare_parser.py
11. +3 -3  Forums/Libre/crawler_selenium.py
12. +2 -6  Forums/Libre/parser.py
13. +13 -13  Forums/OnniForums/crawler_selenium.py
14. +10 -10  Forums/Procrax/crawler_selenium.py

+13 -13  Forums/AbyssForum/crawler_selenium.py

@@ -30,19 +30,19 @@ baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
-    # forumName = getForumName()
-    driver = getAccess()
+    # opentor()
+    forumName = getForumName()
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-    # new_parse(forumName, baseURL, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser

@@ -260,7 +260,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling AbyssForum forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the AbyssForum forum done.")

 # Returns 'True' if the link is Topic link
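
Nearly every crawler touched by this commit follows the same pattern: the Selenium crawl phase (opentor/getAccess/login/crawlForum/closetor) is commented out and the parse step is enabled with logging turned on. Stitching the interleaved -/+ lines above back together, the new startCrawling() reduces to the sketch below; getForumName(), new_parse(), and baseURL come straight from the diff, and the crawl phase is kept as commented-out dead code for the next scraping pass.

    # Post-commit startCrawling(), reassembled from the hunks above
    # (a sketch, not the full file). Only the parse phase runs now.
    def startCrawling():
        # opentor()
        forumName = getForumName()
        # driver = getAccess()
        #
        # if driver != 'down':
        #     try:
        #         login(driver)
        #         crawlForum(driver)
        #     except Exception as e:
        #         print(driver.current_url, e)
        #     closetor(driver)

        new_parse(forumName, baseURL, True)  # createLog=True: parse with logging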


+11 -11  Forums/Altenens/crawler_selenium.py

@@ -30,19 +30,19 @@ baseURL = 'https://altenens.is/'
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
+    # opentor()
     forumName = getForumName()
-    driver = getAccess()
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-    new_parse(forumName, baseURL, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser


+3 -4  Forums/Altenens/parser.py

@@ -9,9 +9,8 @@ import re
 # Here, we are importing BeautifulSoup to search through the HTML tree
 from bs4 import BeautifulSoup

-# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
 # This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
 def altenens_description_parser(soup):

     topic = "-1"  # 0 *topic name

@@ -58,7 +57,7 @@ def altenens_description_parser(soup):
             if inner is not None:
                 inner = inner.strip()
             else:
-                inner = "-1"
+                inner = ""  # cannot use -1 because the post is hidden unless you reply
             post.append(cleanString(inner))

             feedback.append("-1")

@@ -75,8 +74,8 @@ def altenens_description_parser(soup):
     return row

-# This is the method to parse the Listing Pages (one page with many posts)
 # This is the method to parse the Listing Pages (one page with many posts)
 def altenens_listing_parser(soup):
     nm = 0  # *this variable should receive the number of topics


+1 -1  Forums/BestCardingWorld/crawler_selenium.py

@@ -38,7 +38,7 @@ def startCrawling():
     #         print(driver.current_url, e)
     #     closetor(driver)

-    new_parse(forumName, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser


+15 -15  Forums/Cardingleaks/crawler_selenium.py

@@ -4,7 +4,7 @@ __author__ = 'DarkWeb'
 Cardingleaks Forum Crawler (Selenium)
 Crawler updated and fixed

-The site has this thing sometime whereyou'll have to look at a new post everyday. makes sure
+The site has this thing sometime where you'll have to look at a new post everyday. makes sure
 you login first before crawling.
 '''

@@ -27,7 +27,7 @@ from Forums.Cardingleaks.parser import cardingleaks_links_parser
 from Forums.Utilities.utilities import cleanHTML

 counter = 1
-baseURL = 'https://cardingleaks.ws/'
+baseURL = 'https://leaks.ws/'

 # Opens Tor Browser, crawls the website

@@ -35,7 +35,7 @@ def startCrawling():
     # opentor()
     forumName = getForumName()
     # driver = getAccess()
-
+    #
     # if driver != 'down':
     #     try:
     #         login(driver)

@@ -44,7 +44,7 @@ def startCrawling():
     #         print(driver.current_url, e)
     #     closetor(driver)

-    new_parse(forumName, baseURL, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser

@@ -96,7 +96,7 @@ def getForumName() -> str:
 # Return the link of the website
 def getFixedURL():
-    url = 'https://cardingleaks.ws/'
+    url = 'https://leaks.ws/'

     return url

@@ -194,23 +194,23 @@ def getInterestedLinks():
     links = []

     # # carding methods
-    links.append('https://cardingleaks.ws/forums/carding-methods.82/')
+    links.append('https://leaks.ws/forums/carding-methods.82/')
     # # carding schools
-    # links.append('https://cardingleaks.ws/forums/help-desk-carding-school.35/')
+    # links.append('https://leaks.ws/forums/help-desk-carding-school.35/')
     # # carding discussion
-    # links.append('https://cardingleaks.ws/forums/carding-discussion-desk.58/')
+    # links.append('https://leaks.ws/forums/carding-discussion-desk.58/')
     # # carding tutorials
-    # links.append('https://cardingleaks.ws/forums/carding-tutorials.13/')
+    # links.append('https://leaks.ws/forums/carding-tutorials.13/')
     # # carding tools and software
-    # links.append('https://cardingleaks.ws/forums/carding-tools-softwares.10/')
+    # links.append('https://leaks.ws/forums/carding-tools-softwares.10/')
     # # exploits and cracking tools
-    # links.append('https://cardingleaks.ws/forums/exploits-cracking-tools.22/')
+    # links.append('https://leaks.ws/forums/exploits-cracking-tools.22/')

     return links

 def crawlForum(driver):
-    print("Crawling the Cardinglinks forum")
+    print("Crawling the Cardingleaks forum")

     linksToCrawl = getInterestedLinks()

@@ -245,7 +245,7 @@ def crawlForum(driver):
                     savePage(driver.page_source, topic + f"page{counter}")  # very important

                     # comment out
-                    if counter == 5:
+                    if counter == 2:
                         break

                     try:

@@ -264,7 +264,7 @@ def crawlForum(driver):
                 break

             # comment out
-            if count == 10:
+            if count == 1:
                 break

             try:

@@ -280,7 +280,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling Cardingleaksforum done successfully. Press ENTER to continue\n")
+    print("Crawling the Cardingleaks forum done.")

 # Returns 'True' if the link is Topic link, may need to change for every website


+15 -17  Forums/CryptBB/crawler_selenium.py

@@ -28,19 +28,19 @@ baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
+    # opentor()
     forumName = getForumName()
-    driver = getAccess()
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)

-    new_parse(forumName, baseURL, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser

@@ -238,8 +238,6 @@ def getInterestedLinks():
 def crawlForum(driver):
-    print("Crawling the CryptBB forum")
-
     print("Crawling the CryptBB forum")
     linksToCrawl = getInterestedLinks()

     i = 0

@@ -273,7 +271,7 @@ def crawlForum(driver):
                     savePage(driver.page_source, topic + f"page{counter}")  # very important

                     # comment out
-                    if counter == 10:
+                    if counter == 2:
                         break

                     try:

@@ -291,10 +289,10 @@ def crawlForum(driver):
                 driver.back()

             # comment out
-            # break
+            break

             # comment out
-            if count == 20:
+            if count == 1:
                 break

             try:

@@ -312,7 +310,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling CrypttBB done successfully. Press ENTER to continue\n")
+    print("Crawling the CrypttBB forum done.")

 # Returns 'True' if the link is Topic link, may need to change for every website


+18 -18  Forums/HiddenAnswers/crawler_selenium.py

@@ -30,19 +30,19 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
-    # forumName = getForumName()
-    driver: webdriver.Firefox = getAccess()
+    # opentor()
+    forumName = getForumName()
+    # driver: webdriver.Firefox = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-    # new_parse(forumName, baseURL, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser

@@ -219,8 +219,8 @@ def crawlForum(driver: webdriver.Firefox):
                     savePage(driver.page_source, topic + f"page{counter}")  # very important

                     # comment out
-                    # if counter == 2:
-                    #     break
+                    if counter == 2:
+                        break

                     try:
                         page = ""  # no next page so far may have some later on

@@ -235,11 +235,11 @@ def crawlForum(driver: webdriver.Firefox):
                 driver.back()

             # comment out
-            # break
+            break

             # comment out
-            # if count == 1:
-            #     break
+            if count == 1:
+                break

             try:
                 link = driver.find_element(by=By.CLASS_NAME, value='qa-page-next').get_attribute('href')

@@ -255,7 +255,7 @@ def crawlForum(driver: webdriver.Firefox):
             print(link, e)
         i += 1

-    input("Crawling HiddenAnswers done successfully. Press ENTER to continue\n")
+    print("Crawling the HiddenAnswers forum done.")

 # Returns 'True' if the link is Topic link


+9 -1  Forums/Initialization/forumsList.txt

@@ -1 +1,9 @@
-HiddenAnswers
+AbyssForum
+Altenens
+BestCardingWorld
+Cardingleaks
+CryptBB
+HiddenAnswers
+Libre
+OnniForums
+Procrax

+19 -6  Forums/Initialization/forums_mining.py

@@ -4,7 +4,6 @@ __author__ = 'DarkWeb'
 Starting point of the Darkweb Forums Mining
 '''

-import os
 from datetime import *
 from Forums.BestCardingWorld.crawler_selenium import crawler as crawlerBestCardingWorld
 from Forums.CryptBB.crawler_selenium import crawler as crawlerCryptBB

@@ -17,7 +16,8 @@ from Forums.Altenens.crawler_selenium import crawler as crawlerAltenens
 from Forums.Libre.crawler_selenium import crawler as crawlerLibre

 import configparser
-import time
+import os
+import subprocess

 config = configparser.ConfigParser()
 config.read('../../setup.ini')

@@ -88,9 +88,22 @@ def createSubdirectories(pagesDir):
         os.mkdir(descReadDir)

+
+# Opens Tor Browser
+def opentor():
+    global pid
+    print("Connecting Tor...")
+    pro = subprocess.Popen(config.get('TOR', 'firefox_binary_path'))
+    pid = pro.pid
+    # time.sleep(7.5)
+    input('Press ENTER when Tor is connected to continue')
+    return
+

 # main method
 if __name__ == '__main__':
+    # opentor()
+
     # assignment from forumsList.txt
     forumsList = getForums()

@@ -98,10 +111,10 @@ if __name__ == '__main__':
     for forum in forumsList:
         forum = forum.replace('\n','')
-        print("Creating listing and description directories ... for " + forum)
+        print("\nCreating listing and description directories ... for " + forum)
         createDirectory(forum)
-        time.sleep(5)  # wait for directories to be created
-        print("Directories created successfully.")
+        # time.sleep(5)  # wait for directories to be created
+        print("Directories created.")

         if forum == "BestCardingWorld":
             crawlerBestCardingWorld()

@@ -122,7 +135,7 @@ if __name__ == '__main__':
         elif forum == 'Libre':
             crawlerLibre()

-    print("Scraping process completed successfully!")
+    print("Scraping process completed!")


+25 -8  Forums/Initialization/prepare_parser.py

@@ -73,13 +73,11 @@ def persist_data(url, row, cur):
     forum = create_forum(cur, row, url)

     board = create_board(cur, row, forum)

-    author = create_author(cur, row, forum)
+    author = create_user(cur, row, forum, 0)

-    topic = create_topic(cur, forum, row, author)
+    topic = create_topic(cur, row, forum, board, author)

-    create_posts(cur, row, forum, board, topic)
+    create_posts(cur, row, forum, topic)


 def incrementError():

@@ -191,8 +189,9 @@ def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descript
         if trace.find("already exists") == -1:
             incrementError()
             print(f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!")
+            traceback.print_exc()
             if createLog:
-                logFile.write(str(nError) + f"There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
+                logFile.write(str(nError) + f". There was a problem to persist the files ({listingFile} + {descriptionFile}) in the database!\n")
             return False
         else:
             return True

@@ -223,7 +222,7 @@ def new_parse(forum, url, createLog):
     from Forums.Initialization.forums_mining import config, CURRENT_DATE

-    print("Parsing The " + forum + " Forum and conduct data classification to store the information in the database.")
+    print("Parsing the " + forum + " forum and conduct data classification to store the information in the database.")

     # Connecting to the database
     con = connectDataBase()

@@ -261,6 +260,7 @@ def new_parse(forum, url, createLog):
         parseDescriptionError = False
         persistDescriptionError = False
         moveDescriptionError = False
+        findDescriptionError = False

         rw = []

@@ -272,6 +272,8 @@ def new_parse(forum, url, createLog):
         if doDescription:

+            nFound = 0
+
             for rec in rw:
                 rec = rec.split(',')

@@ -280,6 +282,9 @@ def new_parse(forum, url, createLog):
                 # Reading the associated description Html Pages
                 descriptions = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Description", descriptionPattern))

+                nFound += len(descriptions)
+
                 for descriptionIndex, descriptionFile in enumerate(descriptions):
                     print("Reading description folder of '" + forum + "', file '" + os.path.basename(

@@ -331,7 +336,19 @@ def new_parse(forum, url, createLog):
                 else:
                     moveDescriptionError = True

-        if not (readDescriptionError or parseDescriptionError or persistDescriptionError or moveDescriptionError):
+            if not (nFound > 0):
+                findDescriptionError = True
+
+                incrementError()
+                print(f"There was a problem to locate the file(s) for {listingFile} in the Description section!")
+                if createLog:
+                    logFile.write(
+                        str(nError) + f". There was a problem to locate the file(s) for {listingFile}"
+                                      f" in the Description section!\n")
+
+        if not (readDescriptionError or parseDescriptionError or persistDescriptionError
+                or moveDescriptionError or findDescriptionError):

             # move listing files of completed folder
             move_file(listingFile, createLog, logFile)
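
The persist_data() hunk rewires every call after create_board(): the author row now comes from create_user(), the topic is keyed to both board and author, and create_posts() no longer takes the board. Reading only the -/+ lines above, the new chain is as sketched below (the create_* helpers are defined elsewhere in the repo and are untouched by this commit):

    # Post-commit persist_data() chain, using only the signatures in the hunk.
    def persist_data(url, row, cur):
        forum = create_forum(cur, row, url)
        board = create_board(cur, row, forum)
        author = create_user(cur, row, forum, 0)              # was create_author(cur, row, forum)
        topic = create_topic(cur, row, forum, board, author)  # board now passed in
        create_posts(cur, row, forum, topic)                  # board argument dropped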


+3 -3  Forums/Libre/crawler_selenium.py

@@ -31,7 +31,7 @@ def startCrawling():
     # opentor()
     forumName = getForumName()
     # driver = getAccess()
-
+    #
     # if driver != 'down':
     #     try:
     #         login(driver)

@@ -40,7 +40,7 @@ def startCrawling():
     #         print(driver.current_url, e)
     #     closetor(driver)

-    new_parse(forumName, baseURL, False)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser

@@ -275,7 +275,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling Libre done successfully. Press ENTER to continue\n")
+    input("Crawling the Libre forum done.")

 # Returns 'True' if the link is Topic link, may need to change for every website


+2 -6  Forums/Libre/parser.py

@@ -9,11 +9,8 @@ import re
 # Here, we are importing BeautifulSoup to search through the HTML tree
 from bs4 import BeautifulSoup, ResultSet, Tag

-# This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
 # This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
 def libre_description_parser(soup: Tag):

     # Fields to be parsed

@@ -88,7 +85,6 @@ def libre_description_parser(soup: Tag):
         date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
         date_time_cleaned = date_posted.replace(user_name, "")[3:-12]
-        print(date_time_cleaned)
         datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT")
         addDate.append(datetime_append)

@@ -130,7 +126,6 @@ def libre_description_parser(soup: Tag):

 # This is the method to parse the Listing Pages (one page with many posts)
 def libre_listing_parser(soup):
-
     nm = 0  # *this variable should receive the number of topics
     forum = "Libre"  # 0 *forum name

@@ -217,6 +212,7 @@ def libre_listing_parser(soup):
         addDate=addDate
     )

+
 def libre_links_parser(soup):
     # Returning all links that should be visited by the Crawler
     href = []


+13 -13  Forums/OnniForums/crawler_selenium.py

@@ -31,19 +31,19 @@ baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
+    # opentor()
     forumName = getForumName()
-    driver = getAccess()
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-    # new_parse(forum=forumName, url=baseURL, createLog=False)
+    new_parse(forum=forumName, url=baseURL, createLog=True)

 # Opens Tor Browser

@@ -214,7 +214,7 @@ def getInterestedLinks():
 def crawlForum(driver):
-    print("Crawling the OnniForums")
+    print("Crawling the OnniForums forum")

     linksToCrawl = getInterestedLinks()

@@ -288,7 +288,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling OnniForums done successfully. Press ENTER to continue\n")
+    print("Crawling the OnniForums forum done.")

 # Returns 'True' if the link is Topic link


+10 -10  Forums/Procrax/crawler_selenium.py

@@ -34,7 +34,7 @@ FORUM_NAME = 'Procrax'
 def startCrawling():
     # opentor()
     # driver = getAccess()
-
+    #
     # if driver != 'down':
     #     try:
     #         login(driver)

@@ -46,7 +46,7 @@ def startCrawling():
     new_parse(
         forum=FORUM_NAME,
         url=BASE_URL,
-        createLog=False
+        createLog=True
     )

@@ -190,9 +190,9 @@ def getInterestedLinks():
     # # general hacking
     links.append('https://procrax.cx/forums/general-hacking.24/')
     # # hacking security tools
-    links.append('https://procrax.cx/forums/hacking-security-tools.20/')
+    # links.append('https://procrax.cx/forums/hacking-security-tools.20/')
     # # hacktube
-    links.append('https://procrax.cx/forums/hacktube.22/')
+    # links.append('https://procrax.cx/forums/hacktube.22/')
     # # cardable
     # links.append('https://procrax.cx/forums/cardable-websites.28/')
     # # tools

@@ -205,7 +205,7 @@ def getInterestedLinks():
 def crawlForum(driver):
-    print("Crawling the Procrax")
+    print("Crawling the Procrax forum")

     linksToCrawl = getInterestedLinks()

@@ -240,8 +240,8 @@ def crawlForum(driver):
                     savePage(driver.page_source, topic + f"page{counter}")  # very important

                     # comment out
-                    # if counter == 2:
-                    #     break
+                    if counter == 2:
+                        break

                     try:
                         page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')

@@ -257,10 +257,10 @@ def crawlForum(driver):
                 driver.back()

             # comment out
-            # break
+            break

             # comment out
-            if count == 20:
+            if count == 1:
                 break

             try:

@@ -278,7 +278,7 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    input("Crawling Procrax done successfully. Press ENTER to continue\n")
+    print("Crawling the Procrax forum done.")

 # Returns 'True' if the link is Topic link, may need to change for every website

