From eacf94a0dfa3e71115e694335c69f5889775fc2c Mon Sep 17 00:00:00 2001
From: rida
Date: Tue, 16 Jan 2024 03:28:00 -0800
Subject: [PATCH] Added crawler and parser for Endchan; still need to work out
 the database connection.

---
 Forums/Endchan/crawler_selenium.py | 274 +++++++++++++++++++++++++++++
 Forums/Endchan/parser.py           | 212 ++++++++++++++++++++++
 2 files changed, 486 insertions(+)
 create mode 100644 Forums/Endchan/crawler_selenium.py
 create mode 100644 Forums/Endchan/parser.py

diff --git a/Forums/Endchan/crawler_selenium.py b/Forums/Endchan/crawler_selenium.py
new file mode 100644
index 0000000..182113b
--- /dev/null
+++ b/Forums/Endchan/crawler_selenium.py
@@ -0,0 +1,274 @@
+__author__ = 'DarkWeb'
+
+'''
+Endchan Forum Crawler (Selenium)
+'''
+
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
+from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
+from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+import urllib.parse as urlparse
+import os, re, time
+from bs4 import BeautifulSoup
+from Forums.Initialization.prepare_parser import new_parse
+from Forums.Endchan.parser import endchan_links_parser
+from Forums.Utilities.utilities import cleanHTML
+
+counter = 1
+baseURL = 'http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion/'
+
+
+# Opens Tor Browser and crawls the website
+def startCrawling():
+    forumName = getForumName()
+    driver = getAccess()
+
+    if driver != 'down':
+        try:
+            login(driver)
+            crawlForum(driver)
+        except Exception as e:
+            print(driver.current_url, e)
+        closeDriver(driver)
+
+    new_parse(forumName, baseURL, True)
+
+
+# Endchan requires no account login; this only waits for the manual CAPTCHA
+def login(driver):
+    input("Press ENTER when CAPTCHA is completed\n")
+    # Wait up to 50 seconds for the listing page to show up
+    # (this XPath may need to change based on a different seed URL)
+    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
+        (By.XPATH, '//*[@id="tab_content"]')))
+
+
+# Returns the name of the website
+def getForumName() -> str:
+    name = 'Endchan'
+    return name
+
+
+# Returns the link of the website
+def getFixedURL():
+    url = 'http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion/'
+    return url
+
+
+# Closes Tor Browser
+def closeDriver(driver):
+    print('Closing Tor...')
+    driver.close()  # close the tab
+    time.sleep(3)
+    return
+
+
+# Creates a Firefox 'driver' and configures its 'Profile'
+# to use the Tor proxy and socket
+def createFFDriver():
+    from Forums.Initialization.forums_mining import config
+
+    ff_binary = FirefoxBinary(config.get('TOR', 'firefox_binary_path'))
+
+    ff_prof = FirefoxProfile(config.get('TOR', 'firefox_profile_path'))
+    ff_prof.set_preference("places.history.enabled", False)
+    ff_prof.set_preference("privacy.clearOnShutdown.offlineApps", True)
+    ff_prof.set_preference("privacy.clearOnShutdown.passwords", True)
+    ff_prof.set_preference("privacy.clearOnShutdown.siteSettings", True)
+    ff_prof.set_preference("privacy.sanitize.sanitizeOnShutdown", True)
+    ff_prof.set_preference("signon.rememberSignons", False)
+    ff_prof.set_preference("network.cookie.lifetimePolicy", 2)
+    ff_prof.set_preference("network.dns.disablePrefetch", True)
+    ff_prof.set_preference("network.http.sendRefererHeader", 0)
+    ff_prof.set_preference("permissions.default.image", 3)
+    ff_prof.set_preference("browser.download.folderList", 2)
+    ff_prof.set_preference("browser.download.manager.showWhenStarting", False)
+    ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain")
+    ff_prof.set_preference('network.proxy.type', 1)
+    ff_prof.set_preference("network.proxy.socks_version", 5)
+    ff_prof.set_preference('network.proxy.socks', '127.0.0.1')
+    ff_prof.set_preference('network.proxy.socks_port', 9150)
+    ff_prof.set_preference('network.proxy.socks_remote_dns', True)
+    ff_prof.set_preference("javascript.enabled", True)
+    ff_prof.update_preferences()
+
+    service = Service(config.get('TOR', 'geckodriver_path'))
+
+    driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service)
+
+    driver.maximize_window()
+
+    return driver
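+
+# Editor note: a quick manual sanity check of the proxy wiring above before a
+# long crawl (a sketch, not part of the pipeline; check.torproject.org is the
+# standard Tor self-test page):
+#
+#   driver = createFFDriver()
+#   driver.get('https://check.torproject.org')
+#   'Congratulations' in driver.page_source  # True when traffic exits via Tor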
ff_prof.set_preference("network.cookie.lifetimePolicy", 2) + ff_prof.set_preference("network.dns.disablePrefetch", True) + ff_prof.set_preference("network.http.sendRefererHeader", 0) + ff_prof.set_preference("permissions.default.image", 3) + ff_prof.set_preference("browser.download.folderList", 2) + ff_prof.set_preference("browser.download.manager.showWhenStarting", False) + ff_prof.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/plain") + ff_prof.set_preference('network.proxy.type', 1) + ff_prof.set_preference("network.proxy.socks_version", 5) + ff_prof.set_preference('network.proxy.socks', '127.0.0.1') + ff_prof.set_preference('network.proxy.socks_port', 9150) + ff_prof.set_preference('network.proxy.socks_remote_dns', True) + ff_prof.set_preference("javascript.enabled", True) + ff_prof.update_preferences() + + service = Service(config.get('TOR', 'geckodriver_path')) + + driver = webdriver.Firefox(firefox_binary=ff_binary, firefox_profile=ff_prof, service=service) + + driver.maximize_window() + + return driver + + +def getAccess(): + url = getFixedURL() + driver = createFFDriver() + try: + driver.get(url) + return driver + except: + driver.close() + return 'down' + + +# Saves the crawled html page +def savePage(driver, page, url): + cleanPage = cleanHTML(driver, page) + filePath = getFullPathName(url) + os.makedirs(os.path.dirname(filePath), exist_ok=True) + open(filePath, 'wb').write(cleanPage.encode('utf-8')) + return + + +# Gets the full path of the page to be saved along with its appropriate file name +def getFullPathName(url): + from Forums.Initialization.forums_mining import config, CURRENT_DATE + + mainDir = os.path.join(config.get('Project', 'shared_folder'), "Forums/" + getForumName() + "/HTML_Pages") + fileName = getNameFromURL(url) + if not isListingLink(url): + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Description\\' + fileName + '.html') + else: + fullPath = os.path.join(mainDir, CURRENT_DATE + r'\\Listing\\' + fileName + '.html') + return fullPath + + +# Creates the file name from passed URL +def getNameFromURL(url): + global counter + name = ''.join(e for e in url if e.isalnum()) + if name == '': + name = str(counter) + counter = counter + 1 + return name + + +def getInterestedLinks(): + links = [] + # /tech/ - Technology + links.append('http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion/tech/') + # /g/ - Technolo/g/y + links.append('http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion/g/') + # /os/ - Online Security + links.append('http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion/os/') + # /hack/ - Hacking + links.append('http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion/hack/') + # http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion / tech + g + markov + os + agdg + cyber + HTML + 2600 + + return links + + +def crawlForum(driver): + print("Crawling the Endchan forum") + + linksToCrawl = getInterestedLinks() + i = 0 + while i < len(linksToCrawl): + link = linksToCrawl[i] + print('Crawling :', link) + try: + has_next_page = True + count = 0 + + while has_next_page: + try: + driver.get(link) + except: + driver.refresh() + html = driver.page_source + savePage(driver, html, link) + + topics = topicPages(html) + for topic in topics: + has_next_topic_page = True + counter = 1 + page = topic + + while has_next_topic_page: + itemURL = urlparse.urljoin(baseURL, str(page)) + try: + driver.get(itemURL) + except: + driver.refresh() + + if 
+
+
+def getInterestedLinks():
+    links = []
+    # /tech/ - Technology
+    links.append('http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion/tech/')
+    # /g/ - Technolo/g/y
+    links.append('http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion/g/')
+    # /os/ - Online Security
+    links.append('http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion/os/')
+    # /hack/ - Hacking
+    links.append('http://enxx3byspwsdo446jujc52ucy2pf5urdbhqw3kbsfhlfjwmbpj5smdad.onion/hack/')
+    # other candidate boards on this host: /g/, /markov/, /agdg/, /cyber/, /HTML/, /2600/
+
+    return links
+
+
+def crawlForum(driver):
+    print("Crawling the Endchan forum")
+
+    linksToCrawl = getInterestedLinks()
+
+    i = 0
+    while i < len(linksToCrawl):
+        link = linksToCrawl[i]
+        print('Crawling :', link)
+        try:
+            has_next_page = True
+            count = 0
+
+            while has_next_page:
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(driver, html, link)
+
+                topics = topicPages(html)
+                for topic in topics:
+                    has_next_topic_page = True
+                    counter = 1
+                    page = topic
+
+                    while has_next_topic_page:
+                        itemURL = urlparse.urljoin(baseURL, str(page))
+                        try:
+                            driver.get(itemURL)
+                        except:
+                            driver.refresh()
+
+                        # a redirect back to a listing page means the topic is gone
+                        if isListingLink(driver.current_url):
+                            break
+
+                        savePage(driver, driver.page_source, topic + f"page{counter}")  # very important
+
+                        # uncomment to stop after two pages of a topic (testing)
+                        # if counter == 2:
+                        #     break
+
+                        try:
+                            page = driver.find_element(By.ID, value='linkNext').get_attribute('href')
+                            if page == "":
+                                raise NoSuchElementException
+                            counter += 1
+
+                        except NoSuchElementException:
+                            has_next_topic_page = False
+
+                    # making sure we go back to the listing page (browser back button simulation)
+                    try:
+                        driver.get(link)
+                    except:
+                        driver.refresh()
+
+                    # uncomment to stop after the first topic (testing)
+                    # break
+
+                # uncomment to stop after two listing pages (testing)
+                # if count == 1:
+                #     break
+
+                try:
+                    link = driver.find_element(By.ID, value='linkNext').get_attribute('href')
+                    if link == "":
+                        raise NoSuchElementException
+                    count += 1
+
+                except NoSuchElementException:
+                    has_next_page = False
+
+        except Exception as e:
+            print(link, e)
+        i += 1
+
+    print("Crawling the Endchan forum done.")
+
+
+# Returns True if the link is a topic link; may need to change for every website
+def isDescriptionLink(url):
+    if '/res/' in url:
+        return True
+    return False
+
+
+# Returns True if the link is a listing page link
+def isListingLink(url):
+    if re.match(r".*onion/.*/", url) and '/res/' not in url:
+        return True
+    return False
+
+
+# Calls the parser to extract the topic links that should be visited
+def topicPages(html):
+    soup = BeautifulSoup(html, "html.parser")
+    return endchan_links_parser(soup)
+
+
+def crawler():
+    startCrawling()
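+
+# Editor note: with this board's URL scheme the two predicates split as
+# follows (the thread id below is hypothetical, host abbreviated):
+#
+#   isListingLink('http://enxx...onion/tech/')                  # True
+#   isDescriptionLink('http://enxx...onion/tech/res/123.html')  # True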
+
diff --git a/Forums/Endchan/parser.py b/Forums/Endchan/parser.py
new file mode 100644
index 0000000..07b39e0
--- /dev/null
+++ b/Forums/Endchan/parser.py
@@ -0,0 +1,212 @@
+# Here, we are importing the auxiliary functions to clean or convert data
+from Forums.Utilities.utilities import *
+import datetime
+import re
+
+# Here, we are importing BeautifulSoup to search through the HTML tree
+from bs4 import BeautifulSoup
+
+
+# This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
+def endchan_description_parser(soup):
+
+    # Fields to be parsed
+    topic = "-1"     # 0 *topic name
+    user = []        # 1 *all users of each post
+    status = []      # 2 all users' authority in each post such as (adm, member, dangerous)
+    reputation = []  # 3 all users' karma in each post (usually found as a number)
+    interest = []    # 4 all users' interests in each post
+    sign = []        # 5 all users' signatures in each post (usually a standard message after the content of the post)
+    post = []        # 6 all messages of each post
+    feedback = []    # 7 all feedback of each vendor (this was found in just one forum and with a number format)
+    addDate = []     # 8 all dates of each post
+    image_user = []  # 9 all user avatars of each post
+    image_post = []  # 10 all first images of each post
+
+    # Finding the topic (should be just one coming from the Listing Page)
+    entire_post = soup.find('div', {"id": "threadList"}).find('div', {"id": "divThreads"}).find(
+        'div', class_=re.compile('opCell'))
+
+    original_post = entire_post.find('div', {"class": "innerOP"})
+    post_header = original_post.find('div', {"class": "opHead"})
+
+    topic = post_header.find('span', {"class": "labelSubject"}).text
+    topic = re.sub(r"\[\w*\]", '', topic)
+    topic = topic.replace(",", "")
+    topic = topic.replace("\n", "")
+    topic = cleanString(topic.strip())
+
+    # The replies are separated from the original post, so we parse the original
+    # post first and then the repeated reply tags. Helper functions to reuse code:
+    def get_user(area):
+        name = area.find('a', class_=re.compile('linkName'))
+        author = name.text.strip()
+        user.append(cleanString(author))
+
+    def get_post(area):
+        content = area.find('div', {"class": "divMessage"})
+        content = content.text.strip()
+        post.append(cleanString(content))
+
+    def get_date(area):
+        dt = area.find('span', {"class": "labelCreated"}).text
+        dt = dt.strip().split()
+        date_time_obj = datetime.datetime.strptime(dt[0], '%m-%d-%Y')
+        addDate.append(date_time_obj)
+
+    def get_user_img(area):
+        avatar_img = area.find('img', class_=re.compile('imgFlag'))
+        if avatar_img is not None:
+            avatar_img = avatar_img.get('src').split('base64,')[-1]
+        else:
+            avatar_img = "-1"
+        image_user.append(avatar_img)
+
+    def get_first_img(area):
+        img = "-1"
+        uploads = area.find('div', class_=re.compile('panelUploads'))
+        img_cell = uploads.find('figure', {"class": "uploadCell"}) if uploads is not None else None
+        if img_cell is not None:
+            img = img_cell.find('img').get('src').split('base64,')[-1]
+        image_post.append(img)
+
+    # Endchan does not have status, reputation, interest, signature or feedback,
+    # so those lists are filled with the placeholder value
+    def set_other_lists():
+        status.append("-1")
+        reputation.append("-1")
+        interest.append("-1")
+        sign.append("-1")
+        feedback.append("-1")
+
+    # For the original post, get all the fields we are interested in
+    get_user(post_header)
+    get_post(original_post)
+    get_date(post_header)
+    get_user_img(post_header)
+    get_first_img(original_post)
+    set_other_lists()
+
+    # Finding the repeated tag that corresponds to the listing of replies
+    post_replies = entire_post.find('div', {"class": "divPosts"}).find_all('div', class_=re.compile('postCell'))
+
+    # For all replies, get all the fields we are interested in
+    for ipost in post_replies:
+        post_area = ipost.find('div', {"class": "innerPost"})
+
+        get_user(post_area)
+        get_date(post_area)
+        get_post(post_area)
+        get_first_img(post_area)
+        get_user_img(post_area)
+        set_other_lists()
+
+    # Populate the final variable (this should be a list with all the fields scraped)
+    row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
+
+    # Sending the results
+    return row
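+
+# Editor note: the tuple above is the per-topic record the shared pipeline
+# consumes. A minimal sketch of the hand-off (assuming the conventions of the
+# other forum modules; new_parse in Forums.Initialization.prepare_parser reads
+# the saved HTML pages and performs the database insertion):
+#
+#   row = endchan_description_parser(soup)
+#   (topic, user, status, reputation, interest, sign,
+#    post, feedback, addDate, image_user, image_post) = row
+#   # every per-post list must hold one entry per post, in page order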
+
+
+# This is the method to parse the Listing Pages (one page with many topics)
+def endchan_listing_parser(soup):
+
+    nm = 0             # *this variable should receive the number of topics
+    forum = "Endchan"  # 0 *forum name
+    board = "-1"       # 1 *board name (the previous level of the topic in the forum categorization tree.
+                       # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
+    author = []        # 2 *all authors of each topic
+    topic = []         # 3 *all topics
+    views = []         # 4 number of views of each topic
+    posts = []         # 5 number of posts of each topic
+    href = []          # 6 this variable should receive all cleaned urls (we will use this to do the merge
+                       # between Listing and Description pages)
+    addDate = []       # 7 when the topic was created (difficult to find)
+    image_author = []  # 8 all author avatars used in each topic
+
+    # Finding the board (should be just one)
+    header = soup.find("header", {"class": "boardHeader"})
+    labelName = header.find("p", {"id": "labelName"}).text
+    board = cleanString(labelName.strip())
+
+    # Finding the repeated tag that corresponds to the listing of topics
+    topics = soup.find('div', {"id": "threadList"}).find('div', {"id": "divThreads"}).find_all(
+        'div', class_=re.compile('opCell'))
+
+    # Counting how many topics
+    nm = len(topics)
+
+    for itopic in topics:
+        post_header = itopic.find('div', {"class": "innerOP"}).find('div', {"class": "opHead"})
+        subject = post_header.find('span', {"class": "labelSubject"}).text
+
+        # Adding the topic to the topic list
+        subject = re.sub(r"\[\w*\]", '', subject)
+        topic.append(cleanString(subject.strip()))
+
+        # Adding the author avatar
+        avatar_img = post_header.find('img', class_=re.compile('imgFlag'))
+        if avatar_img is not None:
+            avatar_img = avatar_img.get('src').split('base64,')[-1]
+        else:
+            avatar_img = "-1"
+        image_author.append(avatar_img)
+
+        # Adding the url to the list of urls
+        # (using linkSelf to get the link, because the website is formatted differently)
+        link = post_header.find('a', {"class": "linkSelf"}).get('href')
+        href.append(link)
+
+        # Finding the author of the topic
+        name = post_header.find('a', class_=re.compile('linkName'))
+        user = name.text.strip()
+        author.append(cleanString(user))
+
+        # Finding the number of replies (find_all returns an empty list when there are none)
+        replies = itopic.find('div', {"class": "divPosts"}).find_all('div', class_=re.compile('postCell'))
+        num_replies = str(len(replies))
+        posts.append(cleanString(num_replies))
+
+        # No information on the number of views
+        views.append('-1')
+
+        # Getting the date the topic was added
+        dt = post_header.find('span', {"class": "labelCreated"}).text
+        dt = dt.strip().split()
+        date_time_obj = datetime.datetime.strptime(dt[0], '%m-%d-%Y')
+        addDate.append(date_time_obj)
+
+    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate, image_author)
+
+
+# Returning all the links that should be visited by the Crawler
+def endchan_links_parser(soup):
+
+    href = []
+
+    listing = soup.find('div', {"id": "threadList"}).find('div', {"id": "divThreads"}).find_all(
+        'div', class_=re.compile('opCell'))
+
+    for a in listing:
+        link = a.find('a', {"class": "linkSelf"}).get('href')
+        href.append(link)
+
+    return href
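+
+# Editor note: a minimal sketch of running the links parser against a saved
+# listing page (the file name is hypothetical):
+#
+#   with open('tech.html', 'r', encoding='utf-8') as f:
+#       soup = BeautifulSoup(f.read(), 'html.parser')
+#   for link in endchan_links_parser(soup):
+#       print(link)  # e.g. '/tech/res/12345.html'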