@@ -81,21 +81,18 @@ def incogsnoo_description_parser(soup):
     dt = soup.find("p", {"class": "submitted"}).find("span")["title"]
     # Convert to datetime object - e.g. 2023-12-18 05:49:20
     date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z')
-    sdate = date_time_obj.strftime('%m %d %Y')
-    stime = date_time_obj.strftime('%I:%M %p')
+    # sdate = date_time_obj.strftime('%m %d %Y')
+    # stime = date_time_obj.strftime('%I:%M %p')

-    date = convertDate(sdate, "english", datetime.now()) + " " + stime
+    # date = convertDate(sdate, "english", datetime.now()) + " " + stime
     # e.g. "12/18/2023 05:49 AM"
-    addDate.append(date)
+    addDate.append(date_time_obj)

     image_user.append("-1")
     image_post.append("-1")

     posts = soup.find("div", {"class": "comments"}).findAll("details")

     # For each message (post), get all the fields we are interested to:

     for ipost in posts:
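Review note: this hunk, and the two matching hunks below, make the same change: stop building a convertDate()-formatted date string and append the raw datetime object to addDate instead, leaving the old formatting path commented out. A minimal, self-contained sketch of the parsing step; the sample timestamp string is an assumption inferred from the strptime format:

    from datetime import datetime

    dt = "Mon, 18 Dec 2023 05:49:20 GMT"  # hypothetical span["title"] value
    date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z')
    print(date_time_obj)                       # 2023-12-18 05:49:20
    print(date_time_obj.strftime('%I:%M %p'))  # 05:49 AM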
@@ -143,17 +140,16 @@ def incogsnoo_description_parser(soup):
         dt = ipost.find("p", {"class": "created"})["title"]
         # Convert to datetime object - e.g. 2023-12-18 05:49:20
         date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z')
-        sdate = date_time_obj.strftime('%m %d %Y')
-        stime = date_time_obj.strftime('%I:%M %p')
+        # sdate = date_time_obj.strftime('%m %d %Y')
+        # stime = date_time_obj.strftime('%I:%M %p')

-        date = convertDate(sdate, "english", datetime.now()) + " " + stime
+        # date = convertDate(sdate, "english", datetime.now()) + " " + stime
         # e.g. "12/18/2023 05:49 AM"
-        addDate.append(date)
+        addDate.append(date_time_obj)

         image_user.append("-1")
         image_post.append("-1")

     # Populate the final variable (this should be a list with all fields scraped)

     row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
@@ -163,7 +159,6 @@ def incogsnoo_description_parser(soup):
     return row

-
 # This is the method to parse the Listing Pages (one page with many posts)
 # parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
 # stores info it needs in different lists, these lists are returned after being organized
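The comment block above describes the function changed in the next hunk. Condensed from that hunk, the per-topic body looks roughly like this; the loop header itself sits outside the diff context, so it is assumed:

    for itopic in topics:  # assumed iteration source, not shown in this diff
        # Collect the post URL and the raw datetime for each listed topic
        link = cleanLink(itopic.find("a", {"class": "comments"}).get("href"))
        href.append(link)
        dt = itopic.find("p", {"class": "submitted"}).find("span")["title"]
        addDate.append(datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z'))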
@@ -223,18 +218,17 @@ def incogsnoo_listing_parser(soup):

         # Adding the url to the list of urls
         link = itopic.find("a", {"class": "comments"}).get("href")
         link = cleanLink(link)
         href.append(link)

         # Finding dates
         p_tag = itopic.find("p", {"class": "submitted"})
         dt = p_tag.find("span")["title"]
         date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z')
-        sdate = date_time_obj.strftime('%m %d %Y')
-        stime = date_time_obj.strftime('%I:%M %p')
-        date = convertDate(sdate, "english", datetime.now()) + " " + stime
+        # sdate = date_time_obj.strftime('%m %d %Y')
+        # stime = date_time_obj.strftime('%I:%M %p')
+        # date = convertDate(sdate, "english", datetime.now()) + " " + stime
         # e.g. "12/18/2023 05:49 AM"
-        addDate.append(date)
+        addDate.append(date_time_obj)

         image_author.append("-1")
@@ -255,7 +249,6 @@ def incogsnoo_links_parser(soup):
     listing_parent = soup.find("div", {"id": "links", "class": "sr"})
     listing = listing_parent.findAll("div", {"class": "entry"})

-    count = 0
     for entry in listing:

         parent_div = entry.find("div", {"class": "meta"}).find("div", {"class", "links"})
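Review note: together with the next hunk, this removes the leftover debug limiter (count = 0, the commented "if count == 10: break", and count += 1). Separately, the last context line passes the set literal {"class", "links"} where BeautifulSoup expects an attrs dict; presumably the intended call is:

    parent_div = entry.find("div", {"class": "meta"}).find("div", {"class": "links"})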
@@ -263,9 +256,4 @@ def incogsnoo_links_parser(soup):
         if a_tag:
             href.append(a_tag.get("href"))

-        # if count == 10:
-        #     break
-
-        count += 1
-
     return href
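For reference, a hypothetical driver for the links parser; the file name and the "html.parser" backend are assumptions, not part of this patch:

    from bs4 import BeautifulSoup

    with open("listing_page.html", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    for link in incogsnoo_links_parser(soup):
        print(link)  # one href per entry, now without the 10-item debug cap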