From 9a64698a9c91ef02dddb8d4c5615659026821ddd Mon Sep 17 00:00:00 2001
From: Khoi <minhkhoitran2k3@gmail.com>
Date: Wed, 4 Oct 2023 16:21:56 -0700
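Subject: [PATCH] HiddenAnswers: parse-only run and more tolerant parser

Last change before REU ended.

- crawler_selenium.py: startCrawling() no longer opens Tor or crawls (that
  code is commented out); it only calls getForumName() and new_parse().
- parser.py: a missing status or karma span now yields "-1" instead of
  raising AttributeError; the listing author is read from the span text
  rather than from a nested <a>; listing dates handle "N minute(s)",
  "N day(s)", and dates that already contain a four-digit year.
- prepare_parser.py: import the HiddenAnswers parser and dispatch to its
  description and listing parsers in new_parse().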

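---
Review notes (not part of the commit message):
- parser.py, listing dates: a value that contains neither "minute" nor
  "day" nor a four-digit year (e.g. an "N hours ago" style string, if the
  site emits one) reaches the final strptime("%b %d, %Y") and would raise
  ValueError. Consider an explicit branch for other relative units.
- parser.py, karma: only AttributeError is caught; float() on an
  unexpected karma string would still raise ValueError.
- parser.py: the new year check uses re.search; confirm `re` is imported
  in that module (not visible in this diff).
- parser.py: {"class", "qa-a-item-who-title"} is a set literal, not a
  dict; {"class": "qa-a-item-who-title"} was presumably intended.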
 Forums/HiddenAnswers/crawler_selenium.py | 26 +++++------
 Forums/HiddenAnswers/parser.py           | 57 ++++++++++++++++--------
 Forums/Initialization/prepare_parser.py  |  5 +++
 3 files changed, 57 insertions(+), 31 deletions(-)

diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py
index bb73764..d225fa2 100644
--- a/Forums/HiddenAnswers/crawler_selenium.py
+++ b/Forums/HiddenAnswers/crawler_selenium.py
@@ -30,19 +30,19 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion
 
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
-    # forumName = getForumName()
-    driver: webdriver.Firefox = getAccess()
-
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-
-    # new_parse(forumName, baseURL, False)
+    # opentor()
+    forumName = getForumName()
+    # driver: webdriver.Firefox = getAccess()
+
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
+
+    new_parse(forumName, baseURL, False)
 
 
 # Opens Tor Browser
diff --git a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py
index 1a3ee2d..4cf36d5 100644
--- a/Forums/HiddenAnswers/parser.py
+++ b/Forums/HiddenAnswers/parser.py
@@ -38,14 +38,20 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
     datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S")
     addDate.append(datetime_obj)
     
-    question_user_status = question.find("span", {"class": "qa-q-view-who-title"}).text
-    status.append(cleanString(question_user_status.strip()))
+    try:
+        question_user_status = question.find("span", {"class": "qa-q-view-who-title"}).text
+        status.append(cleanString(question_user_status.strip()))
+    except AttributeError:
+        status.append("-1")
     
-    question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}).text
-    # Convert karma to pure numerical string
-    if question_user_karma.find("k") > -1:
-        question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
-    reputation.append(cleanString(question_user_karma.strip()))
+    try:
+        question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}).text
+        # Convert karma to pure numerical string
+        if question_user_karma.find("k") > -1:
+            question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
+        reputation.append(cleanString(question_user_karma.strip()))
+    except AttributeError:
+        reputation.append("-1")
     
     question_content = question.find("div", {"class": "qa-q-view-content qa-post-content"}).text
     post.append(cleanString(question_content.strip()))
@@ -71,14 +77,20 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
         post_data = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).text
         post.append(cleanString(post_data.strip()))
         
-        user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}).text
-        status.append(cleanString(user_reputations.strip()))
+        try:
+            user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}).text
+            status.append(cleanString(user_reputations.strip()))
+        except AttributeError:
+            status.append("-1")
         
-        karma = replies.find("span", {"class": "qa-a-item-who-points-data"}).text
-        # Convert karma to pure numerical string
-        if karma.find("k") > -1:
-            karma = str(float(karma.replace("k", "")) * 1000)
-        reputation.append(cleanString(karma.strip()))
+        try:
+            karma = replies.find("span", {"class": "qa-a-item-who-points-data"}).text
+            # Convert karma to pure numerical string
+            if karma.find("k") > -1:
+                karma = str(float(karma.replace("k", "")) * 1000)
+            reputation.append(cleanString(karma.strip()))
+        except AttributeError:
+            reputation.append("-1")
         
         feedback.append("-1")
         sign.append("-1")
@@ -114,7 +126,7 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
         topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text
         topic.append(cleanString(topic_of_query.strip()))
         
-        author = queries.find("span", {"class": "qa-q-item-who-data"}).find("a").text
+        author = queries.find("span", {"class": "qa-q-item-who-data"}).text
         user.append(cleanString(author.strip()))
         
         num_answers = queries.find("span", {"class": "qa-a-count-data"}).text
@@ -124,10 +136,19 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
         
         date_posted = queries.find("span", {"class": "qa-q-item-when-data"}).text
         
-        if date_posted.find("day") > 0:
-            datetime_obj = datetime.now() - timedelta(days=1)
+        if date_posted.find("minute") > 0:
+            minutes_ago = date_posted.split(' ')[0]
+            datetime_obj = datetime.now() - timedelta(minutes=int(minutes_ago))
+        
+        elif date_posted.find("day") > 0:
+            days_ago = date_posted.split(' ')[0]
+            datetime_obj = datetime.now() - timedelta(days=int(days_ago))
+            
+        elif bool(re.search(r"\d{4}", date_posted)):
+             datetime_obj = datetime.strptime(date_posted, "%b %d, %Y")
+        
         else:
-            datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
+            datetime_obj = datetime.strptime(f"{date_posted}, {date.today().year}", "%b %d, %Y")
         addDate.append(datetime_obj)
         #this link will be cleaned
         
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index a1ef429..b425af4 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -12,6 +12,7 @@ from Forums.OnniForums.parser import *
 from Forums.Altenens.parser import *
 from Forums.Procrax.parser import *
 from Forums.Libre.parser import *
+from Forums.HiddenAnswers.parser import *
 
 from Forums.Classifier.classify_product import predict
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@@ -163,6 +164,8 @@ def new_parse(forum, url, createLog):
                 rmm = procrax_description_parser(soup)
             elif forum == "Libre":
                 rmm = libre_description_parser(soup)
+            elif forum == "HiddenAnswers":
+                rmm = HiddenAnswers_description_parser(soup)
 
             # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
             key = u"Url:" + os.path.basename(line2).replace(".html", "")
@@ -248,6 +251,8 @@ def new_parse(forum, url, createLog):
                     rw = procrax_listing_parser(soup)
                 elif forum == "Libre":
                     rw = libre_listing_parser(soup)
+                elif forum == "HiddenAnswers":
+                    rw = HiddenAnswers_listing_parser(soup)
 
             except: