From 709ca196cc642d8c6f6bd769c270ea59e9e53b12 Mon Sep 17 00:00:00 2001 From: Khoi Date: Thu, 20 Jul 2023 15:10:39 -0700 Subject: [PATCH] Completed and tested parsers for Libre --- Forums/Libre/parser.py | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/Forums/Libre/parser.py b/Forums/Libre/parser.py index 1991d7a..c951ad5 100644 --- a/Forums/Libre/parser.py +++ b/Forums/Libre/parser.py @@ -7,12 +7,14 @@ from datetime import timedelta import re # Here, we are importing BeautifulSoup to search through the HTML tree -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, ResultSet, Tag # This is the method to parse the Description Pages (one page to each topic in the Listing Pages) -def libre_description_parser(soup): + + +def libre_description_parser(soup: Tag): # Fields to be parsed topic = "-1" # 0 *topic name @@ -29,11 +31,34 @@ def libre_description_parser(soup): topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text topic = cleanString(topic_found.strip()) + + original_post: Tag = soup.find("div", {"class": "flex items-start"}) + + original_user = original_post.find("div", {"class": "info-p"}).find("a", {"class": "link"}).text + user.append(cleanString(original_user.replace("/u/", "").strip())) + + original_user_statistics: ResultSet[Tag] = original_post.find("div", {"class": "info-p"}).find_all("span") + + original_time = original_user_statistics[0].text[2:] + datetime_append = datetime.strptime(original_time, "%Y-%m-%d %H:%M:%S GMT") + addDate.append(datetime_append) + + original_karma = original_user_statistics[1].text[2] + reputation.append(cleanString(original_karma.strip())) + + original_content = soup.find("div", {"class": "content-p"}).text + post.append(cleanString(original_content.strip())) + + + status.append("-1") + interest.append("-1") + sign.append("-1") + feedback.append("-1") # Finding the repeated tag that corresponds to the listing of posts # try: - posts = soup.find_all("div", {"class": "flex items-stretch"}) + posts: ResultSet[Tag] = soup.find_all("div", {"class": "flex items-stretch"}) # For each message (post), get all the fields we are interested to: @@ -42,9 +67,8 @@ def libre_description_parser(soup): # Finding the author (user) of the post - user_name = ipost.find("a", {"class": "link"}).text - user_name_cleaned = user_name.split("/")[1] - user.append(cleanString(user_name_cleaned)) # Remember to clean the problematic characters + user_name = ipost.find("p", {"class": "text-zinc-400 text-justify"}).find("a", {"class": "link"}).text + user.append(cleanString(user_name.replace("/u/", "").strip())) # Remember to clean the problematic characters status.append("-1") @@ -64,6 +88,7 @@ def libre_description_parser(soup): date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text date_time_cleaned = date_posted.replace(user_name, "")[3:-12] + print(date_time_cleaned) datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT") addDate.append(datetime_append) @@ -73,8 +98,6 @@ def libre_description_parser(soup): # Finding the user's signature - # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) - sign.append("-1") # As no information about user's feedback was found, just assign "-1" to the variable