
debugged forums

Branch: main
westernmeadow committed 1 year ago
parent commit baa6974be7
16 changed files with 149 additions and 129 deletions
 1. +0  -1   .idea/DW_Pipeline_Test.iml
 2. +13 -11  Forums/AbyssForum/crawler_selenium.py
 3. +7  -5   Forums/Altenens/crawler_selenium.py
 4. +8  -6   Forums/BestCardingWorld/crawler_selenium.py
 5. +26 -24  Forums/BestCardingWorld/parser.py
 6. +14 -12  Forums/Cardingleaks/crawler_selenium.py
 7. +12 -10  Forums/CryptBB/crawler_selenium.py
 8. +1  -1   Forums/CryptBB/parser.py
 9. +12 -10  Forums/HiddenAnswers/crawler_selenium.py
10. +5  -2   Forums/HiddenAnswers/parser.py
11. +1  -9   Forums/Initialization/forumsList.txt
12. +5  -0   Forums/Initialization/prepare_parser.py
13. +13 -11  Forums/Libre/crawler_selenium.py
14. +12 -10  Forums/OnniForums/crawler_selenium.py
15. +8  -7   Forums/OnniForums/parser.py
16. +12 -10  Forums/Procrax/crawler_selenium.py

+ 0  - 1   .idea/DW_Pipeline_Test.iml

@@ -22,7 +22,6 @@
     <option value="$MODULE_DIR$/MarketPlaces/TorBay" />
     <option value="$MODULE_DIR$/MarketPlaces/TorMarket" />
     <option value="$MODULE_DIR$/MarketPlaces/ViceCity" />
-    <option value="$MODULE_DIR$/Forums/AbyssForum" />
     <option value="$MODULE_DIR$/Forums/Altenens" />
     <option value="$MODULE_DIR$/Forums/Cardingleaks" />
     <option value="$MODULE_DIR$/Forums/HiddenAnswers" />


+ 13  - 11   Forums/AbyssForum/crawler_selenium.py

@@ -32,15 +32,15 @@ baseURL = 'http://qyvjopwdgjq52ehsx6paonv2ophy3p4ivfkul4svcaw6qxlzsaboyjid.onion
 def startCrawling():
     # opentor()
     forumName = getForumName()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(forumName, baseURL, True)

@@ -121,6 +121,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

 def getAccess():

@@ -241,14 +243,14 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:
                     break

             try:
-                link = driver.find_element(by=By.XPATH, value = '/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
+                link = driver.find_element(by=By.XPATH, value='/html/body/div[2]/div[2]/div[2]/div[2]/ul/li[9]/a').get_attribute('href')
                 if link == "":
                     raise NoSuchElementException
                 count += 1


+ 7  - 5   Forums/Altenens/crawler_selenium.py

@@ -42,7 +42,7 @@ def startCrawling():
             print(driver.current_url, e)
         closetor(driver)

-    # new_parse(forumName, baseURL, True)
+    new_parse(forumName, baseURL, True)

 # Opens Tor Browser

@@ -118,8 +118,8 @@ def createFFDriver():
     ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
     ff_prof.set_preference("signon.rememberSignons", False)
     ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
-    ff_prof.set_preference("network.dns.disablePrefetch", True)
-    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    # ff_prof.set_preference("network.dns.disablePrefetch", True)
+    # ff_prof.set_preference("network.http.sendRefererHeader", 0)
     ff_prof.set_preference("permissions.default.image", 3)
     ff_prof.set_preference("browser.download.folderList", 2)
     ff_prof.set_preference("browser.download.manager.showWhenStarting", False)

@@ -136,6 +136,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

@@ -253,7 +255,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:

@@ -272,7 +274,7 @@ def crawlForum(driver):
                 print(link, e)
             i += 1

-    input("Crawling Altenens forum done successfully. Press ENTER to continue\n")
+    print("Crawling the Altenens forum done.")

 # Returns 'True' if the link is Topic link, may need to change for every website


+ 8  - 6   Forums/BestCardingWorld/crawler_selenium.py

@@ -114,6 +114,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

@@ -238,8 +240,8 @@ def crawlForum(driver):
             try:
                 nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[4]/ul')
-                li = nav.find_element_by_class_name('next')
-                page = li.find_element_by_tag_name('a').get_attribute('href')
+                li = nav.find_element(by=By.CLASS_NAME, value='next')
+                page = li.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
                 if page == "":
                     raise NoSuchElementException
                 counter += 1

@@ -252,7 +254,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:

@@ -260,8 +262,8 @@ def crawlForum(driver):
             try:
                 bar = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[3]/ul')
-                next = bar.find_element_by_class_name('next')
-                link = next.find_element_by_tag_name('a').get_attribute('href')
+                next = bar.find_element(by=By.CLASS_NAME, value='next')
+                link = next.find_element(by=By.TAG_NAME, value='a').get_attribute('href')
                 if link == "":
                     raise NoSuchElementException
                 count += 1

@@ -273,7 +275,7 @@ def crawlForum(driver):
                 print(link, e)
             i += 1

-    input("Crawling BestCardingWorld forum done sucessfully. Press ENTER to continue\n")
+    print("Crawling the BestCardingWorld forum done.")

 # Returns 'True' if the link is a description link
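Note: the locator changes above track the Selenium 4 API, which dropped the find_element_by_* helpers in favor of find_element with a By constant. A minimal sketch of the new style, assuming a live driver on a phpBB-style page (next_page_link is a hypothetical name, not from the repo):

    from selenium.webdriver.common.by import By

    def next_page_link(driver):
        # Same traversal as the diff: nav bar -> li.next -> anchor href
        nav = driver.find_element(by=By.XPATH, value='/html/body/div[1]/div[2]/div[2]/div[4]/ul')
        li = nav.find_element(by=By.CLASS_NAME, value='next')
        return li.find_element(by=By.TAG_NAME, value='a').get_attribute('href')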


+ 26  - 24   Forums/BestCardingWorld/parser.py

@@ -152,7 +152,7 @@ def bestcardingworld_description_parser(soup):

     # Populate the final variable (this should be a list with all fields scraped)

-    row = (topic, post, user, addDate, feedback, status, reputation, sign, interest)
+    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)

     # Sending the results

@@ -166,15 +166,17 @@ def bestcardingworld_description_parser(soup):
 #return: 'row' that contains a variety of lists that each hold info on the listing page
 def bestcardingworld_listing_parser(soup):

-    nm = 0  # this variable should receive the number of topics
-    topic = []  # 1 all topics
-    board = "-1"  # 2 board name (the previous level of the topic in the Forum categorization tree.
+    nm = 0  # *this variable should receive the number of topics
+    forum = "BestCardingWorld"  # 0 *forum name
+    board = "-1"  # 1 *board name (the previous level of the topic in the Forum categorization tree.
                   # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
-    view = []  # 3 number of views of each topic
-    post = []  # 4 number of posts of each topic
-    user = []  # 5 all users of each topic
-    addDate = []  # 6 when the topic was created (difficult to find)
-    href = []  # 16 this variable should receive all cleaned urls (we will use this to do the marge between Listing and Description pages)
+    author = []  # 2 *all authors of each topic
+    topic = []  # 3 *all topics
+    views = []  # 4 number of views of each topic
+    posts = []  # 5 number of posts of each topic
+    href = []  # 6 this variable should receive all cleaned urls (we will use this to do the marge between
+               # Listing and Description pages)
+    addDate = []  # 7 when the topic was created (difficult to find)

     # Finding the board (should be just one)

@@ -187,7 +189,12 @@ def bestcardingworld_listing_parser(soup):
     itopics = soup.find('ul', {"class": "topiclist topics"}).findAll('div',{"class": "list-inner"})
     replies = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "posts"})
-    views = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "views"})
+    view = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "views"})
+
+    # Counting how many topics we have found so far
+    nm = len(itopics)

     index = 0
     for itopic in itopics:

@@ -213,10 +220,6 @@ def bestcardingworld_listing_parser(soup):
         topics = itopic.find('a', {"class": "topictitle"}).text
         topic.append(cleanString(topics))

-        # Counting how many topics we have found so far
-        nm = len(topic)
-
         # Adding the url to the list of urls
         link = itopic.find('a', {"class": "topictitle"}).get('href')
         link = cleanLink(link)

@@ -224,18 +227,18 @@ def bestcardingworld_listing_parser(soup):

         # Finding the author of the topic
         ps = itopic.find('div', {"class":"responsive-hide"}).find('a', {"class": "username-coloured"}).text
-        author = ps.strip()
-        user.append(cleanString(author))
+        user = ps.strip()
+        author.append(cleanString(user))

         # Finding the number of replies
-        posts = replies[index].text.split()[0]
-        posts = posts.strip()
-        post.append(cleanString(posts))
+        post = replies[index].text.split()[0]
+        post = post.strip()
+        posts.append(cleanString(post))

         # Finding the number of Views
-        tview = views[index].text.split()[0]
+        tview = view[index].text.split()[0]
         tview = tview.strip()
-        view.append(cleanString(tview))
+        views.append(cleanString(tview))

         # If no information about when the topic was added, just assign "-1" to the variable
         #CryptBB doesn't show when topic was first posted on listing page

@@ -245,10 +248,9 @@ def bestcardingworld_listing_parser(soup):
             addDate.append(date_time_obj)
             #addDate.append("-1")

         index += 1

-    return organizeTopics("BestCardingWorld", nm, topic, board, view, post, user, addDate, href)
+    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)

 #called by the crawler to get description links on a listing page
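Note: both changes in this file are field-order fixes — the description row tuple and the organizeTopics call were passing values in positions the consumer did not expect. A named structure would make such mix-ups impossible; a hedged sketch (Row is hypothetical, mirroring the corrected nine-field description row, with placeholder values):

    from collections import namedtuple

    # Hypothetical named row; fields follow the corrected tuple order in the diff.
    Row = namedtuple('Row', ['topic', 'user', 'status', 'reputation',
                             'interest', 'sign', 'post', 'feedback', 'addDate'])

    row = Row(topic='Example topic', user='alice', status='-1', reputation='-1',
              interest='-1', sign='-1', post=['first post'], feedback='-1',
              addDate='-1')
    print(row.topic, row.user)  # fields are read by name, not by position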


+ 14  - 12   Forums/Cardingleaks/crawler_selenium.py

@@ -34,15 +34,15 @@ baseURL = 'https://leaks.ws/'
 def startCrawling():
     # opentor()
     forumName = getForumName()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(forumName, baseURL, True)

@@ -144,6 +144,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

@@ -159,7 +161,7 @@ def getAccess():

 # Saves the crawled html page
-def savePage(page, url):
+def savePage(driver, page, url):
     cleanPage = cleanHTML(driver, page)
     filePath = getFullPathName(url)
     os.makedirs(os.path.dirname(filePath), exist_ok=True)

@@ -242,7 +244,7 @@ def crawlForum(driver):
                         driver.get(itemURL)
                     except:
                         driver.refresh()
-                    savePage(driver.page_source, topic + f"page{counter}")  # very important
+                    savePage(driver, driver.page_source, topic + f"page{counter}")  # very important

                     # comment out
                     if counter == 2:

@@ -261,7 +263,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:
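Note: savePage now takes the driver explicitly rather than relying on one in enclosing scope, so cleanHTML always sees the driver that actually fetched the page. A runnable sketch of the pattern with stand-in helpers — clean_html, path_for, and the file-writing tail are assumptions here, not the module's real bodies:

    import os

    def clean_html(driver, page):      # stand-in for the module's cleanHTML
        return page

    def path_for(url):                 # stand-in for the module's getFullPathName
        return os.path.join('/tmp/crawls', url.replace('/', '_') + '.html')

    def savePage(driver, page, url):
        cleanPage = clean_html(driver, page)   # driver is passed in, not global
        filePath = path_for(url)
        os.makedirs(os.path.dirname(filePath), exist_ok=True)
        with open(filePath, 'w') as f:
            f.write(cleanPage)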


+ 12  - 10   Forums/CryptBB/crawler_selenium.py

@@ -30,15 +30,15 @@ baseURL = 'http://cryptbbtg65gibadeeo2awe3j7s6evg7eklserehqr4w4e2bis5tebid.onion
 def startCrawling():
     # opentor()
     forumName = getForumName()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(forumName, baseURL, True)

@@ -162,6 +162,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

@@ -289,7 +291,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:


+ 1  - 1   Forums/CryptBB/parser.py

@@ -124,7 +124,7 @@ def cryptBB_description_parser(soup):
                     stime = dt.replace('Yesterday,','').strip()
                     date_time_obj = yesterday+ ', '+stime
                     date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
-                elif "hours ago" in dt:
+                elif "hour ago" in dt or "hours ago" in dt:
                     day = day.strftime('%m-%d-%Y')
                     date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
                     date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
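Note: the one-line fix makes the branch match the singular form too — "1 hour ago" contains "hour ago" but not "hours ago", so it previously fell through to the wrong case. A self-contained sketch of that matching (parse_post_date is a hypothetical name; the real branch reads the absolute timestamp out of the span's title attribute instead of computing an offset):

    from datetime import datetime, timedelta

    def parse_post_date(dt: str, now: datetime) -> datetime:
        if "hour ago" in dt or "hours ago" in dt:   # matches "1 hour ago" as well
            return now - timedelta(hours=int(dt.split()[0]))
        return datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')

    now = datetime(2023, 7, 1, 12, 0)
    print(parse_post_date("1 hour ago", now))    # 2023-07-01 11:00:00
    print(parse_post_date("3 hours ago", now))   # 2023-07-01 09:00:00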


+ 12  - 10   Forums/HiddenAnswers/crawler_selenium.py

@@ -32,15 +32,15 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion
 def startCrawling():
     # opentor()
     forumName = getForumName()
-    # driver: webdriver.Firefox = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver: webdriver.Firefox = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(forumName, baseURL, True)

@@ -121,6 +121,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

 def getAccess():

@@ -235,7 +237,7 @@ def crawlForum(driver: webdriver.Firefox):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:


+ 5  - 2   Forums/HiddenAnswers/parser.py

@@ -127,15 +127,18 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
         if date_posted.find("day") > 0:
             datetime_obj = datetime.now() - timedelta(days=1)
         else:
-            datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
+            try:
+                datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
+            except ValueError:
+                datetime_obj = datetime.strptime(f"{date_posted}", "%b %d, %Y")
         addDate.append(datetime_obj)

         #this link will be cleaned
         listing_href = queries.find("div", {"class": "qa-q-item-title"}).find("a").get("href")
         href.append(listing_href)

+    #need to change this method
     nm = len(topic)
     return organizeTopics(forum, nm, board, user, topic, view, post, href, addDate)

 #need to change this method
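Note: the listing dates here apparently come in two shapes — "Jul 5" for the current year and "Jul 5, 2022" with an explicit year — and the added try/except falls back to the second when the first format fails. A runnable sketch of that fallback (parse_listing_date is a hypothetical name):

    from datetime import datetime

    def parse_listing_date(date_posted: str, current_year: int) -> datetime:
        try:
            # "Jul 5" style: append the current year
            return datetime.strptime(f"{date_posted} {current_year}", "%b %d %Y")
        except ValueError:
            # "Jul 5, 2022" style: the string already carries a year
            return datetime.strptime(date_posted, "%b %d, %Y")

    print(parse_listing_date("Jul 5", 2023))        # 2023-07-05 00:00:00
    print(parse_listing_date("Jul 5, 2022", 2023))  # 2022-07-05 00:00:00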


+ 1  - 9   Forums/Initialization/forumsList.txt

@@ -1,9 +1 @@
-AbyssForum
-Altenens
-BestCardingWorld
-Cardingleaks
-CryptBB
-HiddenAnswers
-Libre
-OnniForums
-Procrax
+BestCardingWorld

+ 5  - 0   Forums/Initialization/prepare_parser.py

@@ -12,6 +12,7 @@ from Forums.OnniForums.parser import *
 from Forums.Altenens.parser import *
 from Forums.Procrax.parser import *
 from Forums.Libre.parser import *
+from Forums.HiddenAnswers.parser import *

 from Forums.Classifier.classify_product import predict
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi

@@ -126,6 +127,8 @@ def parse_listing(forum, listingFile, soup, createLog, logFile):
         rw = procrax_listing_parser(soup)
     elif forum == "Libre":
         rw = libre_listing_parser(soup)
+    elif forum == "HiddenAnswers":
+        rw = HiddenAnswers_listing_parser(soup)
     else:
         print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
         raise Exception

@@ -160,6 +163,8 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
         rmm = procrax_description_parser(soup)
     elif forum == "Libre":
         rmm = libre_description_parser(soup)
+    elif forum == "HiddenAnswers":
+        rmm = HiddenAnswers_description_parser(soup)
     else:
         print("MISSING CALL TO DESCRIPTION PARSER IN PREPARE_PARSER.PY!")
         raise Exception
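Note: every new forum grows both elif chains here. A dict-based dispatch would keep the wiring in one place; a sketch under the same star-imports this file already pulls in — the mapping is hypothetical, and the real parse_listing also takes listingFile/createLog/logFile parameters that are omitted for brevity:

    # Hypothetical mapping; values are parser functions star-imported above.
    listing_parsers = {
        "BestCardingWorld": bestcardingworld_listing_parser,
        "Libre": libre_listing_parser,
        "Procrax": procrax_listing_parser,
        "HiddenAnswers": HiddenAnswers_listing_parser,
    }

    def parse_listing(forum, soup):
        parser = listing_parsers.get(forum)
        if parser is None:
            print("MISSING CALL TO LISTING PARSER IN PREPARE_PARSER.PY!")
            raise Exception
        return parser(soup)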


+ 13  - 11   Forums/Libre/crawler_selenium.py

@@ -30,15 +30,15 @@ baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion
 def startCrawling():
     # opentor()
     forumName = getForumName()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(forumName, baseURL, True)

@@ -144,6 +144,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

@@ -255,7 +257,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:

@@ -275,7 +277,7 @@ def crawlForum(driver):
                 print(link, e)
             i += 1

-    input("Crawling the Libre forum done.")
+    print("Crawling the Libre forum done.")

 # Returns 'True' if the link is Topic link, may need to change for every website


+ 12  - 10   Forums/OnniForums/crawler_selenium.py

@@ -33,15 +33,15 @@ baseURL = 'http://onnii6niq53gv3rvjpi7z5axkasurk2x5w5lwliep4qyeb2azagxn4qd.onion
 def startCrawling():
     # opentor()
     forumName = getForumName()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(forum=forumName, url=baseURL, createLog=True)

@@ -139,6 +139,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

@@ -267,7 +269,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:


+ 8  - 7   Forums/OnniForums/parser.py

@@ -139,12 +139,14 @@ def onniForums_listing_parser(soup: BeautifulSoup):
     nm = len(thread_arrays)

     for thread in thread_arrays:  #getting the information from the posts and sorting them into the arrays defined above

-        try:
-            post_subject: str = thread.find("span",{"class": "subject_new"}).text  #getting the topic
+        body = thread.find("span",{"class": "subject_new"})
+        try:
+            post_subject: str = body.text  #getting the topic
         except AttributeError:
-            post_subject: str = thread.find("span",{"class": "subject_old"}).text
+            body = thread.find("span",{"class": "subject_old"})
+            post_subject: str = body.text

         post_subject_cleaned = cleanString(post_subject.strip())
         topic.append(post_subject_cleaned)

@@ -163,9 +165,8 @@ def onniForums_listing_parser(soup: BeautifulSoup):
         author = thread.find("span",{"class" : "author smalltext"}).text
         author_cleaned = cleanString(author.strip())
         user.append(author_cleaned)

-        reply_anchor = thread.find_all("td", {"align": "center"})[2].find('a')
-        thread_link = reply_anchor.get('href')
+        thread_link = body.find('a').get('href')
         href.append(thread_link)

     return organizeTopics(
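Note: keeping the matched span in body means the topic text and the thread link now come from the same element, whether the thread shows as unread (subject_new) or read (subject_old), instead of pulling the href from a replies cell. A standalone sketch of that fallback, using a None-check variant of the diff's try/except on hypothetical markup:

    from bs4 import BeautifulSoup

    html = '<div><span class="subject_old"><a href="/Thread-example">Example</a></span></div>'
    thread = BeautifulSoup(html, 'html.parser')

    body = thread.find("span", {"class": "subject_new"})
    if body is None:                                  # read threads lack subject_new
        body = thread.find("span", {"class": "subject_old"})

    print(body.text)                   # Example
    print(body.find('a').get('href'))  # /Thread-example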


+ 12  - 10   Forums/Procrax/crawler_selenium.py

@@ -33,15 +33,15 @@ FORUM_NAME = 'Procrax'
 # Opens Tor Browser, crawls the website
 def startCrawling():
     # opentor()
-    # driver = getAccess()
-    #
-    # if driver != 'down':
-    #     try:
-    #         login(driver)
-    #         crawlForum(driver)
-    #     except Exception as e:
-    #         print(driver.current_url, e)
-    #     closetor(driver)
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closetor(driver)

     new_parse(
         forum=FORUM_NAME,

@@ -140,6 +140,8 @@ def createFFDriver():
     driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)

+    driver.maximize_window()
+
     return driver

 def getAccess():

@@ -257,7 +259,7 @@ def crawlForum(driver):
                     driver.back()

                 # comment out
-                break
+                # break

                 # comment out
                 if count == 1:
