From 6e1a1b6b2a35d40f0fb954826b88bfe407ccf493 Mon Sep 17 00:00:00 2001
From: Khoi
Date: Thu, 20 Jul 2023 14:03:59 -0700
Subject: [PATCH] completed Libre forum (not tested yet)

---
 Forums/Initialization/forumsList.txt    |   2 +-
 Forums/Initialization/prepare_parser.py |   5 +
 Forums/Libre/crawler_selenium.py        |  18 +-
 Forums/Libre/parser.py                  | 264 ++++++++++--------------
 4 files changed, 127 insertions(+), 162 deletions(-)

diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt
index 6f635a1..3010d1b 100644
--- a/Forums/Initialization/forumsList.txt
+++ b/Forums/Initialization/forumsList.txt
@@ -1 +1 @@
-Cardingleaks
\ No newline at end of file
+Libre
\ No newline at end of file

diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 1c624b3..a1ef429 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -11,6 +11,7 @@ from Forums.CryptBB.parser import *
 from Forums.OnniForums.parser import *
 from Forums.Altenens.parser import *
 from Forums.Procrax.parser import *
+from Forums.Libre.parser import *
 from Forums.Classifier.classify_product import predict
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi

@@ -160,6 +161,8 @@ def new_parse(forum, url, createLog):
             rmm = altenens_description_parser(soup)
         elif forum == "Procrax":
             rmm = procrax_description_parser(soup)
+        elif forum == "Libre":
+            rmm = libre_description_parser(soup)

         # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
         key = u"Url:" + os.path.basename(line2).replace(".html", "")
@@ -243,6 +246,8 @@ def new_parse(forum, url, createLog):
                 rw = altenens_listing_parser(soup)
             elif forum == "Procrax":
                 rw = procrax_listing_parser(soup)
+            elif forum == "Libre":
+                rw = libre_listing_parser(soup)

         except:

diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py
index dfef8db..a2ba332 100644
--- a/Forums/Libre/crawler_selenium.py
+++ b/Forums/Libre/crawler_selenium.py
@@ -28,17 +28,17 @@ baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion

 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
+    # opentor()
     forumName = getForumName()
-    driver = getAccess()
+    # driver = getAccess()

-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)

     new_parse(forumName, baseURL, False)
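
The two new elif branches wire Libre into new_parse()'s existing per-forum dispatch, and startCrawling() has the Tor crawl temporarily commented out, so running it exercises only the parsing stage (consistent with the "not tested yet" subject line). If the dispatch chains keep growing, a name-to-parser registry keeps them flat; a minimal sketch using the parser names from this patch (the registry and helper are illustrative, not existing code in the repo):

    from Forums.Libre.parser import libre_description_parser, libre_listing_parser

    # Illustrative registry: forum name -> (description parser, listing parser).
    PARSER_REGISTRY = {
        "Libre": (libre_description_parser, libre_listing_parser),
        # the other forums would be registered the same way
    }

    def get_parsers(forum):
        # Fail loudly on an unknown forum instead of silently skipping it.
        if forum not in PARSER_REGISTRY:
            raise ValueError("no parsers registered for forum: " + forum)
        return PARSER_REGISTRY[forum]

new_parse() would then unpack the pair once instead of walking two elif chains.
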
diff --git a/Forums/Libre/parser.py b/Forums/Libre/parser.py
index 093c671..1991d7a 100644
--- a/Forums/Libre/parser.py
+++ b/Forums/Libre/parser.py
@@ -12,150 +12,92 @@ from bs4 import BeautifulSoup


 # This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
-def cryptBB_description_parser(soup):
-
+def libre_description_parser(soup):
     # Fields to be parsed

-    topic = "-1"                            # 0 *topic name
-    user = []                               # 1 *all users of each post
-    status = []                             # 2 all user's authority in each post such as (adm, member, dangerous)
-    reputation = []                         # 3 all user's karma in each post (usually found as a number)
-    interest = []                           # 4 all user's interest in each post
-    sign = []                               # 5 all user's signature in each post (usually a standard message after the content of the post)
-    post = []                               # 6 all messages of each post
-    feedback = []                           # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
-    addDate = []                            # 8 all dates of each post
+    topic = "-1"  # 0 *topic name
+    user = []  # 1 *all users of each post
+    status = []  # 2 all user's authority in each post such as (adm, member, dangerous)
+    reputation = []  # 3 all user's karma in each post (usually found as a number)
+    interest = []  # 4 all user's interest in each post
+    sign = []  # 5 all user's signature in each post (usually a standard message after the content of the post)
+    post = []  # 6 all messages of each post
+    feedback = []  # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
+    addDate = []  # 8 all dates of each post

     # Finding the topic (should be just one coming from the Listing Page)
-    li = soup.find("td", {"class": "thead"}).find('strong')
-    topic = li.text
-    topic = re.sub("\[\w*\]", '', topic)
-
-    topic = topic.replace(",","")
-    topic = topic.replace("\n","")
-    topic = cleanString(topic.strip())
+    topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text
+    topic = cleanString(topic_found.strip())

     # Finding the repeated tag that corresponds to the listing of posts

     # try:
-    posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
-        'div', {"class": "post"})
+    posts = soup.find_all("div", {"class": "flex items-stretch"})

     # For each message (post), get all the fields we are interested to:

     for ipost in posts:
-        # Finding a first level of the HTML page
-        post_wrapper = ipost.find('span', {"class": "largetext"})
-        # Finding the author (user) of the post
-        author = post_wrapper.text.strip()
-        user.append(cleanString(author))  # Remember to clean the problematic characters
-
-        # Finding the status of the author
-
-        smalltext = ipost.find('div', {"class": "post_author"})
-
-        '''
-        # Testing here two possibilities to find this status and combine them
-        if ipost.find('div', {"class": "deleted_post_author"}):
-            status.append(-1)
-            interest.append(-1)
-            reputation.append(-1)
-            addDate.append(-1)
-            post.append("THIS POST HAS BEEN REMOVED!")
-            sign.append(-1)
-            feedback.append(-1)
-            continue
-        '''
-
-        # CryptBB does have membergroup and postgroup
-
-        membergroup = smalltext.find('div', {"class": "profile-rank"})
-        postgroup = smalltext.find('div', {"class": "postgroup"})
-        if membergroup != None:
-            membergroup = membergroup.text.strip()
-            if postgroup != None:
-                postgroup = postgroup.text.strip()
-                membergroup = membergroup + " - " + postgroup
-        else:
-            if postgroup != None:
-                membergroup = postgroup.text.strip()
-            else:
-                membergroup = "-1"
-        status.append(cleanString(membergroup))
+        user_name = ipost.find("a", {"class": "link"}).text
+        user_name_cleaned = user_name.split("/")[1]
+        user.append(cleanString(user_name_cleaned))  # Remember to clean the problematic characters
+
+        status.append("-1")

         # Finding the interest of the author
         # CryptBB does not have blurb
-        blurb = smalltext.find('li', {"class": "blurb"})
-        if blurb != None:
-            blurb = blurb.text.strip()
-        else:
-            blurb = "-1"
-        interest.append(cleanString(blurb))
+
+        interest.append("-1")

         # Finding the reputation of the user
         # CryptBB does have reputation
-        author_stats = smalltext.find('div', {"class": "author_statistics"})
-        karma = author_stats.find('strong')
-        if karma != None:
-            karma = karma.text
-            karma = karma.replace("Community Rating: ", "")
-            karma = karma.replace("Karma: ", "")
-            karma = karma.strip()
-        else:
-            karma = "-1"
-        reputation.append(cleanString(karma))
+
+        karma = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
+        karma_cleaned = karma.split(" ")[6]
+        reputation.append(cleanString(karma_cleaned.strip()))
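
The reputation value above is taken by fixed word position (split(" ")[6]) from the same byline paragraph that also carries the poster's name and date, so any wording change on Libre's side silently shifts the index; note too that the username is taken with split("/")[1] here but with split("/")[-1] in the listing parser below, which is worth confirming against a saved page. A more defensive sketch, assuming only that the byline contains a fragment like "karma: 123" (an unverified assumption about the page text, not something this patch establishes):

    import re

    def extract_karma(byline_text):
        # Look for "karma: <number>" anywhere in the byline instead of a fixed
        # word offset; "-1" mirrors the missing-value convention used elsewhere.
        match = re.search(r"karma:\s*(\d+)", byline_text, re.IGNORECASE)
        return match.group(1) if match else "-1"
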

         # Getting here another good tag to find the post date, post content and users' signature

-        postarea = ipost.find('div', {"class": "post_content"})
-
-        dt = postarea.find('span', {"class": "post_date"}).text
-        # dt = dt.strip().split()
-        dt = dt.strip()
-        day=date.today()
-        if "Yesterday" in dt:
-            yesterday = day - timedelta(days=1)
-            yesterday = yesterday.strftime('%m-%d-%Y')
-            stime = dt.replace('Yesterday,','').strip()
-            date_time_obj = yesterday+ ', '+stime
-            date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
-        elif "hours ago" in dt:
-            day = day.strftime('%m-%d-%Y')
-            date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
-            date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
-        else:
-            date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
-        stime = date_time_obj.strftime('%b %d, %Y')
-        sdate = date_time_obj.strftime('%I:%M %p')
-        addDate.append(date_time_obj)
+        date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
+        date_time_cleaned = date_posted.replace(user_name, "")[3:-12]
+        datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT")
+        addDate.append(datetime_append)

         # Finding the post
-
-        inner = postarea.find('div', {"class": "post_body scaleimages"})
-        inner = inner.text.strip()
-        post.append(cleanString(inner))
+        user_post = ipost.find("div", {"class": "content-c"}).text
+        post.append(cleanString(user_post))

         # Finding the user's signature

         # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
-        signature = ipost.find('div', {"class": "signature scaleimages"})
-        if signature != None:
-            signature = signature.text.strip()
-            # print(signature)
-        else:
-            signature = "-1"
-        sign.append(cleanString(signature))
+
+        sign.append("-1")

         # As no information about user's feedback was found, just assign "-1" to the variable

         feedback.append("-1")

     # Populate the final variable (this should be a list with all fields scraped)
+    # print(topic)
+    # print(user)
+    # print(status)
+    # print(reputation)
+    # print(interest)
+    # print(sign)
+    # print(post)
+    # print(feedback)
+    # print(addDate)
+    # print(len(user))
+    # print(len(status))
+    # print(len(reputation))
+    # print(len(interest))
+    # print(len(sign))
+    # print(len(feedback))
+    # print(len(addDate))

     row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
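
Both parsers build datetime objects with the format string "%Y-%m-%d %H:%M:%S GMT", in which strptime matches the trailing "GMT" as literal text rather than as a timezone. A self-contained check with an illustrative value (not taken from a live page):

    from datetime import datetime

    raw = "2023-07-20 14:03:59 GMT"  # illustrative timestamp
    parsed = datetime.strptime(raw, "%Y-%m-%d %H:%M:%S GMT")
    assert parsed == datetime(2023, 7, 20, 14, 3, 59)
    assert parsed.tzinfo is None  # "GMT" matched literally, so the result is naive

If the hard-coded [3:-12] slice above ever misaligns, strptime raises ValueError, so wrapping these calls in try/except would keep one malformed post from aborting the whole page.
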
@@ -163,76 +105,94 @@ def cryptBB_description_parser(soup):

     return row


-# This is the method to parse the Listing Pages (one page with many posts)
-def cryptBB_listing_parser(soup):
-
-    nm = 0                                  # *this variable should receive the number of topics
-    forum = "OnniForums"                    # 0 *forum name
-    board = "-1"                            # 1 *board name (the previous level of the topic in the Forum categorization tree.
-                                            # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
-    author = []                             # 2 *all authors of each topic
-    topic = []                              # 3 *all topics
-    views = []                              # 4 number of views of each topic
-    posts = []                              # 5 number of posts of each topic
-    href = []                               # 6 this variable should receive all cleaned urls (we will use this to do the marge between
-                                            # Listing and Description pages)
-    addDate = []                            # 7 when the topic was created (difficult to find)
+# This is the method to parse the Listing Pages (one page with many posts)
+def libre_listing_parser(soup):
+    nm = 0  # *this variable should receive the number of topics
+    forum = "Libre"  # 0 *forum name
+    board = "-1"  # 1 *board name (the previous level of the topic in the Forum categorization tree.
+    # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
+    author = []  # 2 *all authors of each topic
+    topic = []  # 3 *all topics
+    views = []  # 4 number of views of each topic
+    posts = []  # 5 number of posts of each topic
+    href = []  # 6 this variable should receive all cleaned urls (we will use this to do the merge between
+    # Listing and Description pages)
+    addDate = []  # 7 when the topic was created (difficult to find)

     # Finding the board (should be just one)
-    board = soup.find('span', {"class": "active"}).text
+    board = soup.find('div', {"class": "title"}).find("h1").text
     board = cleanString(board.strip())

     # Finding the repeated tag that corresponds to the listing of topics
-    itopics = soup.find_all('tr',{"class": "inline_row"})
+    itopics = soup.find("div", {"class": "space-y-2 mt-4"}).find_all('div', {"class": "flex box"})
+    nm = 0

     for itopic in itopics:
-
+        nm += 1
         # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
         # to don't miss any topic

         # Adding the topic to the topic list
-        try:
-            topics = itopic.find('span', {"class": "subject_old"}).find('a').text
-        except:
-            topics = itopic.find('span', {"class": "subject_new"}).find('a').text
-        topics = re.sub("\[\w*\]", '', topics)
-        topic.append(cleanString(topics))
-
-        # Counting how many topics we have found so far
-
-        nm = len(topic)
+        topic_string = itopic.find("a", {"class": "link text-xl text-zinc-300"}).text
+        cleaned_topic_string = cleanString(topic_string.strip())
+        topic.append(cleaned_topic_string)

         # Adding the url to the list of urls
-        try:
-            link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
-        except:
-            link = itopic.find('span',{"class": "subject_new"}).find('a').get('href')
-        href.append(link)
+        link_to_clean = itopic.find("a", {"class": "link text-xl text-zinc-300"}).get("href")

-        # Finding the author of the topic
-        ps = itopic.find('div', {"class":"author smalltext"}).find('a').text
-        user = ps.strip()
-        author.append(cleanString(user))
+        href.append(link_to_clean)

-        # Finding the number of replies
-        columns = itopic.findChildren('td',recursive=False)
-        replies = columns[3].text
+        # Finding the author of the topic
+        username_not_cleaned = itopic.find('div', {"class": "flex-grow p-2 text-justify"}).find('a').text
+        username_cleaned = username_not_cleaned.split("/")[-1]
+        author.append(cleanString(username_cleaned))

-        posts.append(cleanString(replies))
+        # Finding the number of views
+        num_views = itopic.find_all("div", {"class": "flex items-center"})[0].find("p").text
+        views.append(cleanString(num_views))

-        # Finding the number of Views
-        tview = columns[4].text
-        views.append(cleanString(tview))
+        # Finding the number of replies
+        num_replies = itopic.find_all("div", {"class": "flex items-center"})[1].find("p").text
+        posts.append(cleanString(num_replies))
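
Views and replies are read positionally from the first and second "flex items-center" cells of each row, which encodes an assumption about Libre's column order. A guarded variant of the same lookup (same class names as the patch; the views-then-replies ordering is the patch's assumption, not verified here):

    def topic_stats(itopic):
        # The listing row is assumed to render views first, replies second.
        cells = itopic.find_all("div", {"class": "flex items-center"})
        if len(cells) < 2:
            raise ValueError("expected at least two stat cells (views, replies)")
        return cells[0].find("p").text, cells[1].find("p").text
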
         # If no information about when the topic was added, just assign "-1" to the variable
-        addDate.append("-1")
-
-    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
-
+        date_time_concatenated = itopic.find("p", {"class": "text-sm text-zinc-400 italic"}).text
+        date_time_cleaned = date_time_concatenated.replace(username_not_cleaned, "")
+        # creating the datetime object
+        date_time_array = date_time_cleaned[3:]
+        datetime_append = datetime.strptime(date_time_array, "%Y-%m-%d %H:%M:%S GMT")
+        addDate.append(datetime_append)
+
+    # print(forum)
+    # print(nm)
+    # print(board)
+    # print(author)
+    # print(topic)
+    # print(views)
+    # print(href)
+    # print(addDate)
+    # print(len(author))
+    # print(len(topic))
+    # print(len(views))
+    # print(len(href))
+    # print(len(addDate))
+
+    return organizeTopics(
+        forum=forum,
+        nm=nm,
+        board=board,
+        author=author,
+        topic=topic,
+        views=views,
+        posts=posts,
+        href=href,
+        addDate=addDate
+    )


 def libre_links_parser(soup):
     # Returning all links that should be visited by the Crawler
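
The patch text ends here, before the body of libre_links_parser. For orientation only, a sketch of what this crawler-facing helper typically returns in this codebase, reusing the topic-link class from libre_listing_parser above (an assumed selector; this is not the code from the actual commit):

    def libre_links_parser_sketch(soup):
        # Collect the href of every topic link on a listing page so the
        # crawler knows which description pages to visit next.
        links = []
        for anchor in soup.find_all("a", {"class": "link text-xl text-zinc-300"}):
            link = anchor.get("href")
            if link:
                links.append(link)
        return links
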