|
@ -7,12 +7,14 @@ from datetime import timedelta |
|
|
import re |
|
|
import re |
|
|
|
|
|
|
|
|
# Here, we are importing BeautifulSoup to search through the HTML tree |
|
|
# Here, we are importing BeautifulSoup to search through the HTML tree |
|
|
from bs4 import BeautifulSoup |
|
|
|
|
|
|
|
|
from bs4 import BeautifulSoup, ResultSet, Tag |
|
|
|
|
|
|
|
|
# This is the method to parse the Description Pages (one page for each topic in the Listing Pages) |
|
|
# This is the method to parse the Description Pages (one page for each topic in the Listing Pages) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def libre_description_parser(soup): |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def libre_description_parser(soup: Tag): |
|
|
# Fields to be parsed |
|
|
# Fields to be parsed |
|
|
|
|
|
|
|
|
topic = "-1" # 0 *topic name |
|
|
topic = "-1" # 0 *topic name |
|
@ -29,11 +31,34 @@ def libre_description_parser(soup): |
|
|
|
|
|
|
|
|
topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text |
|
|
topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text |
|
|
topic = cleanString(topic_found.strip()) |
|
|
topic = cleanString(topic_found.strip()) |
|
|
|
|
|
|
|
|
|
|
|
original_post: Tag = soup.find("div", {"class": "flex items-start"}) |
|
|
|
|
|
|
|
|
|
|
|
original_user = original_post.find("div", {"class": "info-p"}).find("a", {"class": "link"}).text |
|
|
|
|
|
user.append(cleanString(original_user.replace("/u/", "").strip())) |
|
|
|
|
|
|
|
|
|
|
|
original_user_statistics: ResultSet[Tag] = original_post.find("div", {"class": "info-p"}).find_all("span") |
|
|
|
|
|
|
|
|
|
|
|
original_time = original_user_statistics[0].text[2:] |
|
|
|
|
|
datetime_append = datetime.strptime(original_time, "%Y-%m-%d %H:%M:%S GMT") |
|
|
|
|
|
addDate.append(datetime_append) |
|
|
|
|
|
|
|
|
|
|
|
original_karma = original_user_statistics[1].text[2] |
|
|
|
|
|
reputation.append(cleanString(original_karma.strip())) |
|
|
|
|
|
|
|
|
|
|
|
original_content = soup.find("div", {"class": "content-p"}).text |
|
|
|
|
|
post.append(cleanString(original_content.strip())) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
status.append("-1") |
|
|
|
|
|
interest.append("-1") |
|
|
|
|
|
sign.append("-1") |
|
|
|
|
|
feedback.append("-1") |
|
|
|
|
|
|
|
|
# Finding the repeated tag that corresponds to the listing of posts |
|
|
# Finding the repeated tag that corresponds to the listing of posts |
|
|
|
|
|
|
|
|
# try: |
|
|
# try: |
|
|
posts = soup.find_all("div", {"class": "flex items-stretch"}) |
|
|
|
|
|
|
|
|
posts: ResultSet[Tag] = soup.find_all("div", {"class": "flex items-stretch"}) |
|
|
|
|
|
|
|
|
# For each message (post), get all the fields we are interested in: |
|
|
# For each message (post), get all the fields we are interested in: |
|
|
|
|
|
|
|
@ -42,9 +67,8 @@ def libre_description_parser(soup): |
|
|
|
|
|
|
|
|
# Finding the author (user) of the post |
|
|
# Finding the author (user) of the post |
|
|
|
|
|
|
|
|
user_name = ipost.find("a", {"class": "link"}).text |
|
|
|
|
|
user_name_cleaned = user_name.split("/")[1] |
|
|
|
|
|
user.append(cleanString(user_name_cleaned)) # Remember to clean the problematic characters |
|
|
|
|
|
|
|
|
user_name = ipost.find("p", {"class": "text-zinc-400 text-justify"}).find("a", {"class": "link"}).text |
|
|
|
|
|
user.append(cleanString(user_name.replace("/u/", "").strip())) # Remember to clean the problematic characters |
|
|
|
|
|
|
|
|
status.append("-1") |
|
|
status.append("-1") |
|
|
|
|
|
|
|
@ -64,6 +88,7 @@ def libre_description_parser(soup): |
|
|
|
|
|
|
|
|
date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text |
|
|
date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text |
|
|
date_time_cleaned = date_posted.replace(user_name, "")[3:-12] |
|
|
date_time_cleaned = date_posted.replace(user_name, "")[3:-12] |
|
|
|
|
|
print(date_time_cleaned) |
|
|
datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT") |
|
|
datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT") |
|
|
addDate.append(datetime_append) |
|
|
addDate.append(datetime_append) |
|
|
|
|
|
|
|
@ -73,8 +98,6 @@ def libre_description_parser(soup): |
|
|
|
|
|
|
|
|
# Finding the user's signature |
|
|
# Finding the user's signature |
|
|
|
|
|
|
|
|
# signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) |
|
|
|
|
|
|
|
|
|
|
|
sign.append("-1") |
|
|
sign.append("-1") |
|
|
|
|
|
|
|
|
# As no information about user's feedback was found, just assign "-1" to the variable |
|
|
# As no information about user's feedback was found, just assign "-1" to the variable |
|
|