Browse Source

fixes to Incogsnoo

main
westernmeadow 11 months ago
parent
commit
69c9ee3689
2 changed files with 23 additions and 34 deletions
  1. +11
    -10
      Forums/Incogsnoo/crawler_selenium.py
  2. +12
    -24
      Forums/Incogsnoo/parser.py

+ 11
- 10
Forums/Incogsnoo/crawler_selenium.py View File

@@ -159,7 +159,8 @@ def getInterestedLinks():
# Malware
links.append('http://tedditfyn6idalzso5wam5qd3kdtxoljjhbrbbx34q2xkcisvshuytad.onion/r/Malware')
#
# FIND MORE BOARDS
return links
@@ -210,9 +211,9 @@ def crawlForum(driver):
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
# # comment out
# if counter == 2:
# break
try:
# incogsnoo doesn't have next button to load more pages of the description
@@ -232,12 +233,12 @@ def crawlForum(driver):
except:
driver.refresh()
# comment out
# break
# comment out
if count == 1:
break
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
link_tag = driver.find_element(by=By.XPATH, value="/html/body/div[2]/div[last()]/a[contains(text(),'next')]")


+ 12
- 24
Forums/Incogsnoo/parser.py View File

@@ -81,21 +81,18 @@ def incogsnoo_description_parser(soup):
dt = soup.find("p", {"class": "submitted"}).find("span")["title"]
# Convert to datetime object - e.g. 2023-12-18 05:49:20
date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z')
sdate = date_time_obj.strftime('%m %d %Y')
stime = date_time_obj.strftime('%I:%M %p')
# sdate = date_time_obj.strftime('%m %d %Y')
# stime = date_time_obj.strftime('%I:%M %p')
date = convertDate(sdate, "english", datetime.now()) + " " + stime
# date = convertDate(sdate, "english", datetime.now()) + " " + stime
# e.g. "12/18/2023 05:49 AM"
addDate.append(date)
addDate.append(date_time_obj)
image_user.append("-1")
image_post.append("-1")
posts = soup.find("div", {"class": "comments"}).findAll("details")
# For each message (post), get all the fields we are interested to:
for ipost in posts:
@@ -143,17 +140,16 @@ def incogsnoo_description_parser(soup):
dt = ipost.find("p", {"class": "created"})["title"]
# Convert to datetime object - e.g. 2023-12-18 05:49:20
date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z')
sdate = date_time_obj.strftime('%m %d %Y')
stime = date_time_obj.strftime('%I:%M %p')
# sdate = date_time_obj.strftime('%m %d %Y')
# stime = date_time_obj.strftime('%I:%M %p')
date = convertDate(sdate, "english", datetime.now()) + " " + stime
# date = convertDate(sdate, "english", datetime.now()) + " " + stime
# e.g. "12/18/2023 05:49 AM"
addDate.append(date)
addDate.append(date_time_obj)
image_user.append("-1")
image_post.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
@@ -163,7 +159,6 @@ def incogsnoo_description_parser(soup):
return row
# This is the method to parse the Listing Pages (one page with many posts)
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
@@ -223,18 +218,17 @@ def incogsnoo_listing_parser(soup):
# Adding the url to the list of urls
link = itopic.find("a", {"class": "comments"}).get("href")
link = cleanLink(link)
href.append(link)
# Finding dates
p_tag = itopic.find("p", {"class": "submitted"})
dt = p_tag.find("span")["title"]
date_time_obj = datetime.strptime(dt,'%a, %d %b %Y %H:%M:%S %Z')
sdate = date_time_obj.strftime('%m %d %Y')
stime = date_time_obj.strftime('%I:%M %p')
date = convertDate(sdate, "english", datetime.now()) + " " + stime
# sdate = date_time_obj.strftime('%m %d %Y')
# stime = date_time_obj.strftime('%I:%M %p')
# date = convertDate(sdate, "english", datetime.now()) + " " + stime
# e.g. "12/18/2023 05:49 AM"
addDate.append(date)
addDate.append(date_time_obj)
image_author.append("-1")
@@ -255,7 +249,6 @@ def incogsnoo_links_parser(soup):
listing_parent = soup.find("div", {"id": "links", "class": "sr"})
listing = listing_parent.findAll("div", {"class": "entry"})
count = 0
for entry in listing:
parent_div = entry.find("div", {"class": "meta"}).find("div", {"class", "links"})
@@ -263,9 +256,4 @@ def incogsnoo_links_parser(soup):
if a_tag:
href.append(a_tag.get("href"))
# if count == 10:
# break
count += 1
return href

Loading…
Cancel
Save