From 9a64698a9c91ef02dddb8d4c5615659026821ddd Mon Sep 17 00:00:00 2001 From: Khoi Date: Wed, 4 Oct 2023 16:21:56 -0700 Subject: [PATCH] last change before REU ended --- Forums/HiddenAnswers/crawler_selenium.py | 26 +++++------ Forums/HiddenAnswers/parser.py | 57 ++++++++++++++++-------- Forums/Initialization/prepare_parser.py | 5 +++ 3 files changed, 57 insertions(+), 31 deletions(-) diff --git a/Forums/HiddenAnswers/crawler_selenium.py b/Forums/HiddenAnswers/crawler_selenium.py index bb73764..d225fa2 100644 --- a/Forums/HiddenAnswers/crawler_selenium.py +++ b/Forums/HiddenAnswers/crawler_selenium.py @@ -30,19 +30,19 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion # Opens Tor Browser, crawls the website def startCrawling(): - opentor() - # forumName = getForumName() - driver: webdriver.Firefox = getAccess() - - if driver != 'down': - try: - login(driver) - crawlForum(driver) - except Exception as e: - print(driver.current_url, e) - closetor(driver) - - # new_parse(forumName, baseURL, False) + # opentor() + forumName = getForumName() + # driver: webdriver.Firefox = getAccess() + + # if driver != 'down': + # try: + # login(driver) + # crawlForum(driver) + # except Exception as e: + # print(driver.current_url, e) + # closetor(driver) + + new_parse(forumName, baseURL, False) # Opens Tor Browser diff --git a/Forums/HiddenAnswers/parser.py b/Forums/HiddenAnswers/parser.py index 1a3ee2d..4cf36d5 100644 --- a/Forums/HiddenAnswers/parser.py +++ b/Forums/HiddenAnswers/parser.py @@ -38,14 +38,20 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S") addDate.append(datetime_obj) - question_user_status = question.find("span", {"class": "qa-q-view-who-title"}).text - status.append(cleanString(question_user_status.strip())) + try: + question_user_status = question.find("span", {"class": "qa-q-view-who-title"}).text + status.append(cleanString(question_user_status.strip())) + except AttributeError: + status.append("-1") - question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}).text - # Convert karma to pure numerical string - if question_user_karma.find("k") > -1: - question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000) - reputation.append(cleanString(question_user_karma.strip())) + try: + question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}).text + # Convert karma to pure numerical string + if question_user_karma.find("k") > -1: + question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000) + reputation.append(cleanString(question_user_karma.strip())) + except AttributeError: + reputation.append("-1") question_content = question.find("div", {"class": "qa-q-view-content qa-post-content"}).text post.append(cleanString(question_content.strip())) @@ -71,14 +77,20 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup): post_data = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).text post.append(cleanString(post_data.strip())) - user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}).text - status.append(cleanString(user_reputations.strip())) + try: + user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}).text + status.append(cleanString(user_reputations.strip())) + except AttributeError: + status.append("-1") - karma = replies.find("span", {"class": "qa-a-item-who-points-data"}).text - # Convert karma to pure numerical string - if karma.find("k") > -1: - karma = str(float(karma.replace("k", "")) * 1000) - reputation.append(cleanString(karma.strip())) + try: + karma = replies.find("span", {"class": "qa-a-item-who-points-data"}).text + # Convert karma to pure numerical string + if karma.find("k") > -1: + karma = str(float(karma.replace("k", "")) * 1000) + reputation.append(cleanString(karma.strip())) + except AttributeError: + reputation.append("-1") feedback.append("-1") sign.append("-1") @@ -114,7 +126,7 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup): topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text topic.append(cleanString(topic_of_query.strip())) - author = queries.find("span", {"class": "qa-q-item-who-data"}).find("a").text + author = queries.find("span", {"class": "qa-q-item-who-data"}).text user.append(cleanString(author.strip())) num_answers = queries.find("span", {"class": "qa-a-count-data"}).text @@ -124,10 +136,19 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup): date_posted = queries.find("span", {"class": "qa-q-item-when-data"}).text - if date_posted.find("day") > 0: - datetime_obj = datetime.now() - timedelta(days=1) + if date_posted.find("minute") > 0: + minutes_ago = date_posted.split(' ')[0] + datetime_obj = datetime.now() - timedelta(minutes=int(minutes_ago)) + + elif date_posted.find("day") > 0: + days_ago = date_posted.split(' ')[0] + datetime_obj = datetime.now() - timedelta(days=int(days_ago)) + + elif bool(re.search(r"\d{4}", date_posted)): + datetime_obj = datetime.strptime(date_posted, "%b %d, %Y") + else: - datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y") + datetime_obj = datetime.strptime(f"{date_posted}, {date.today().year}", "%b %d, %Y") addDate.append(datetime_obj) #this link will be cleaned diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index a1ef429..b425af4 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -12,6 +12,7 @@ from Forums.OnniForums.parser import * from Forums.Altenens.parser import * from Forums.Procrax.parser import * from Forums.Libre.parser import * +from Forums.HiddenAnswers.parser import * from Forums.Classifier.classify_product import predict # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi @@ -163,6 +164,8 @@ def new_parse(forum, url, createLog): rmm = procrax_description_parser(soup) elif forum == "Libre": rmm = libre_description_parser(soup) + elif forum == "HiddenAnswers": + rmm = HiddenAnswers_description_parser(soup) # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip() key = u"Url:" + os.path.basename(line2).replace(".html", "") @@ -248,6 +251,8 @@ def new_parse(forum, url, createLog): rw = procrax_listing_parser(soup) elif forum == "Libre": rw = libre_listing_parser(soup) + elif forum == "HiddenAnswers": + rw = HiddenAnswers_listing_parser(soup) except: