diff --git a/Forums/Incogsnoo/crawler_selenium.py b/Forums/Incogsnoo/crawler_selenium.py index fd8b92f..eeaf766 100644 --- a/Forums/Incogsnoo/crawler_selenium.py +++ b/Forums/Incogsnoo/crawler_selenium.py @@ -159,7 +159,8 @@ def getInterestedLinks(): # Malware links.append('http://tedditfyn6idalzso5wam5qd3kdtxoljjhbrbbx34q2xkcisvshuytad.onion/r/Malware') - # + # FIND MORE BOARDS + return links @@ -210,9 +211,9 @@ def crawlForum(driver): savePage(driver, driver.page_source, topic + f"page{counter}") # very important - # comment out - if counter == 2: - break + # # comment out + # if counter == 2: + # break try: # incogsnoo doesn't have next button to load more pages of the description @@ -232,12 +233,12 @@ def crawlForum(driver): except: driver.refresh() - # comment out - # break - - # comment out - if count == 1: - break + # # comment out + # break + # + # # comment out + # if count == 1: + # break try: link_tag = driver.find_element(by=By.XPATH, value="/html/body/div[2]/div[last()]/a[contains(text(),'next')]") diff --git a/Forums/Incogsnoo/parser.py b/Forums/Incogsnoo/parser.py index b24caa8..86ab8fc 100644 --- a/Forums/Incogsnoo/parser.py +++ b/Forums/Incogsnoo/parser.py @@ -81,21 +81,18 @@ def incogsnoo_description_parser(soup): dt = soup.find("p", {"class": "submitted"}).find("span")["title"] # Convert to datetime object - e.g. 2023-12-18 05:49:20 date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z') - sdate = date_time_obj.strftime('%m %d %Y') - stime = date_time_obj.strftime('%I:%M %p') + # sdate = date_time_obj.strftime('%m %d %Y') + # stime = date_time_obj.strftime('%I:%M %p') - date = convertDate(sdate, "english", datetime.now()) + " " + stime + # date = convertDate(sdate, "english", datetime.now()) + " " + stime # e.g. "12/18/2023 05:49 AM" - addDate.append(date) + addDate.append(date_time_obj) image_user.append("-1") image_post.append("-1") - - posts = soup.find("div", {"class": "comments"}).findAll("details") - # For each message (post), get all the fields we are interested to: for ipost in posts: @@ -143,17 +140,16 @@ def incogsnoo_description_parser(soup): dt = ipost.find("p", {"class": "created"})["title"] # Convert to datetime object - e.g. 2023-12-18 05:49:20 date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z') - sdate = date_time_obj.strftime('%m %d %Y') - stime = date_time_obj.strftime('%I:%M %p') + # sdate = date_time_obj.strftime('%m %d %Y') + # stime = date_time_obj.strftime('%I:%M %p') - date = convertDate(sdate, "english", datetime.now()) + " " + stime + # date = convertDate(sdate, "english", datetime.now()) + " " + stime # e.g. "12/18/2023 05:49 AM" - addDate.append(date) + addDate.append(date_time_obj) image_user.append("-1") image_post.append("-1") - # Populate the final variable (this should be a list with all fields scraped) row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post) @@ -163,7 +159,6 @@ def incogsnoo_description_parser(soup): return row - # This is the method to parse the Listing Pages (one page with many posts) #parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs #stores info it needs in different lists, these lists are returned after being organized @@ -223,18 +218,17 @@ def incogsnoo_listing_parser(soup): # Adding the url to the list of urls link = itopic.find("a", {"class": "comments"}).get("href") - link = cleanLink(link) href.append(link) # Finding dates p_tag = itopic.find("p", {"class": "submitted"}) dt = p_tag.find("span")["title"] date_time_obj = datetime.strptime(dt,'%a, %d %b %Y %H:%M:%S %Z') - sdate = date_time_obj.strftime('%m %d %Y') - stime = date_time_obj.strftime('%I:%M %p') - date = convertDate(sdate, "english", datetime.now()) + " " + stime + # sdate = date_time_obj.strftime('%m %d %Y') + # stime = date_time_obj.strftime('%I:%M %p') + # date = convertDate(sdate, "english", datetime.now()) + " " + stime # e.g. "12/18/2023 05:49 AM" - addDate.append(date) + addDate.append(date_time_obj) image_author.append("-1") @@ -255,7 +249,6 @@ def incogsnoo_links_parser(soup): listing_parent = soup.find("div", {"id": "links", "class": "sr"}) listing = listing_parent.findAll("div", {"class": "entry"}) - count = 0 for entry in listing: parent_div = entry.find("div", {"class": "meta"}).find("div", {"class", "links"}) @@ -263,9 +256,4 @@ def incogsnoo_links_parser(soup): if a_tag: href.append(a_tag.get("href")) - # if count == 10: - # break - - count += 1 - return href \ No newline at end of file