Completed and tested parsers for Libre

2 years ago · 709ca196cc
--- a/Forums/Libre/parser.py
+++ b/Forums/Libre/parser.py
@ -7,12 +7,14 @@ from datetime import timedelta
 import re

 # Here, we are importing BeautifulSoup to search through the HTML tree
 from bs4 import BeautifulSoup
 from bs4 import BeautifulSoup, ResultSet, Tag

 # This function parses the Description Pages (one page for each topic in the Listing Pages)


 def libre_description_parser(soup):


 def libre_description_parser(soup: Tag):
    # Fields to be parsed

    topic = "-1"  # 0 *topic name
@ -29,11 +31,34 @@ def libre_description_parser(soup):

    topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text
    topic = cleanString(topic_found.strip())
    
    original_post: Tag = soup.find("div", {"class": "flex items-start"})
    
    original_user = original_post.find("div", {"class": "info-p"}).find("a", {"class": "link"}).text
    user.append(cleanString(original_user.replace("/u/", "").strip()))
    
    original_user_statistics: ResultSet[Tag] = original_post.find("div", {"class": "info-p"}).find_all("span")
    
    original_time = original_user_statistics[0].text[2:]
    datetime_append = datetime.strptime(original_time, "%Y-%m-%d %H:%M:%S GMT")
    addDate.append(datetime_append)
    
    original_karma = original_user_statistics[1].text[2]
    reputation.append(cleanString(original_karma.strip()))
    
    original_content = soup.find("div", {"class": "content-p"}).text
    post.append(cleanString(original_content.strip()))
    

    status.append("-1")
    interest.append("-1")
    sign.append("-1")
    feedback.append("-1")

    # Finding the repeated tag that corresponds to the listing of posts

    # try:
    posts = soup.find_all("div", {"class": "flex items-stretch"})
    posts: ResultSet[Tag] = soup.find_all("div", {"class": "flex items-stretch"})

    # For each message (post), get all the fields we are interested in:

@ -42,9 +67,8 @@ def libre_description_parser(soup):

        # Finding the author (user) of the post

        user_name = ipost.find("a", {"class": "link"}).text
        user_name_cleaned = user_name.split("/")[1]
        user.append(cleanString(user_name_cleaned))  # Remember to clean the problematic characters
        user_name = ipost.find("p", {"class": "text-zinc-400 text-justify"}).find("a", {"class": "link"}).text
        user.append(cleanString(user_name.replace("/u/", "").strip()))  # Remember to clean the problematic characters

        status.append("-1")

@ -64,6 +88,7 @@ def libre_description_parser(soup):

        date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
        date_time_cleaned = date_posted.replace(user_name, "")[3:-12]
        print(date_time_cleaned)
        datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT")
        addDate.append(datetime_append)

@ -73,8 +98,6 @@ def libre_description_parser(soup):

        # Finding the user's signature

        # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})

        sign.append("-1")

        # As no information about user's feedback was found, just assign "-1" to the variable