Browse Source

Completed and tested parsers for Libre

main
Khoi 1 year ago
parent
commit
709ca196cc
1 changed files with 31 additions and 8 deletions
  1. +31
    -8
      Forums/Libre/parser.py

+ 31
- 8
Forums/Libre/parser.py View File

@ -7,12 +7,14 @@ from datetime import timedelta
import re
# Here, we are importing BeautifulSoup to search through the HTML tree
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, ResultSet, Tag
# This function parses the Description Pages (one page for each topic in the Listing Pages)
def libre_description_parser(soup):
def libre_description_parser(soup: Tag):
# Fields to be parsed
topic = "-1" # 0 *topic name
@ -29,11 +31,34 @@ def libre_description_parser(soup):
topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text
topic = cleanString(topic_found.strip())
original_post: Tag = soup.find("div", {"class": "flex items-start"})
original_user = original_post.find("div", {"class": "info-p"}).find("a", {"class": "link"}).text
user.append(cleanString(original_user.replace("/u/", "").strip()))
original_user_statistics: ResultSet[Tag] = original_post.find("div", {"class": "info-p"}).find_all("span")
original_time = original_user_statistics[0].text[2:]
datetime_append = datetime.strptime(original_time, "%Y-%m-%d %H:%M:%S GMT")
addDate.append(datetime_append)
original_karma = original_user_statistics[1].text[2]
reputation.append(cleanString(original_karma.strip()))
original_content = soup.find("div", {"class": "content-p"}).text
post.append(cleanString(original_content.strip()))
status.append("-1")
interest.append("-1")
sign.append("-1")
feedback.append("-1")
# Finding the repeated tag that corresponds to the listing of posts
# try:
posts = soup.find_all("div", {"class": "flex items-stretch"})
posts: ResultSet[Tag] = soup.find_all("div", {"class": "flex items-stretch"})
# For each message (post), get all the fields we are interested in:
@ -42,9 +67,8 @@ def libre_description_parser(soup):
# Finding the author (user) of the post
user_name = ipost.find("a", {"class": "link"}).text
user_name_cleaned = user_name.split("/")[1]
user.append(cleanString(user_name_cleaned)) # Remember to clean the problematic characters
user_name = ipost.find("p", {"class": "text-zinc-400 text-justify"}).find("a", {"class": "link"}).text
user.append(cleanString(user_name.replace("/u/", "").strip())) # Remember to clean the problematic characters
status.append("-1")
@ -64,6 +88,7 @@ def libre_description_parser(soup):
date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
date_time_cleaned = date_posted.replace(user_name, "")[3:-12]
print(date_time_cleaned)
datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT")
addDate.append(datetime_append)
@ -73,8 +98,6 @@ def libre_description_parser(soup):
# Finding the user's signature
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
sign.append("-1")
# As no information about user's feedback was found, just assign "-1" to the variable


Loading…
Cancel
Save