Browse Source

fixes to Incogsnoo

main
westernmeadow 11 months ago
parent
commit
69c9ee3689
2 changed files with 23 additions and 34 deletions
  1. +11
    -10
      Forums/Incogsnoo/crawler_selenium.py
  2. +12
    -24
      Forums/Incogsnoo/parser.py

+ 11
- 10
Forums/Incogsnoo/crawler_selenium.py View File

@@ -159,7 +159,8 @@ def getInterestedLinks():
# Malware
links.append('http://tedditfyn6idalzso5wam5qd3kdtxoljjhbrbbx34q2xkcisvshuytad.onion/r/Malware')
#
# FIND MORE BOARDS
return links
@@ -210,9 +211,9 @@ def crawlForum(driver):
savePage(driver, driver.page_source, topic + f"page{counter}") # very important
# comment out
if counter == 2:
break
# # comment out
# if counter == 2:
# break
try:
# incogsnoo doesn't have next button to load more pages of the description
@@ -232,12 +233,12 @@ def crawlForum(driver):
except:
driver.refresh()
# comment out
# break
# comment out
if count == 1:
break
# # comment out
# break
#
# # comment out
# if count == 1:
# break
try:
link_tag = driver.find_element(by=By.XPATH, value="/html/body/div[2]/div[last()]/a[contains(text(),'next')]")


+ 12
- 24
Forums/Incogsnoo/parser.py View File

@@ -81,21 +81,18 @@ def incogsnoo_description_parser(soup):
dt = soup.find("p", {"class": "submitted"}).find("span")["title"]
# Convert to datetime object - e.g. 2023-12-18 05:49:20
date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z')
sdate = date_time_obj.strftime('%m %d %Y')
stime = date_time_obj.strftime('%I:%M %p')
# sdate = date_time_obj.strftime('%m %d %Y')
# stime = date_time_obj.strftime('%I:%M %p')
date = convertDate(sdate, "english", datetime.now()) + " " + stime
# date = convertDate(sdate, "english", datetime.now()) + " " + stime
# e.g. "12/18/2023 05:49 AM"
addDate.append(date)
addDate.append(date_time_obj)
image_user.append("-1")
image_post.append("-1")
posts = soup.find("div", {"class": "comments"}).findAll("details")
# For each message (post), get all the fields we are interested to:
for ipost in posts:
@@ -143,17 +140,16 @@ def incogsnoo_description_parser(soup):
dt = ipost.find("p", {"class": "created"})["title"]
# Convert to datetime object - e.g. 2023-12-18 05:49:20
date_time_obj = datetime.strptime(dt, '%a, %d %b %Y %H:%M:%S %Z')
sdate = date_time_obj.strftime('%m %d %Y')
stime = date_time_obj.strftime('%I:%M %p')
# sdate = date_time_obj.strftime('%m %d %Y')
# stime = date_time_obj.strftime('%I:%M %p')
date = convertDate(sdate, "english", datetime.now()) + " " + stime
# date = convertDate(sdate, "english", datetime.now()) + " " + stime
# e.g. "12/18/2023 05:49 AM"
addDate.append(date)
addDate.append(date_time_obj)
image_user.append("-1")
image_post.append("-1")
# Populate the final variable (this should be a list with all fields scraped)
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate, image_user, image_post)
@@ -163,7 +159,6 @@ def incogsnoo_description_parser(soup):
return row
# This is the method to parse the Listing Pages (one page with many posts)
#parses listing pages, so takes html pages of listing pages using soup object, and parses it for info it needs
#stores info it needs in different lists, these lists are returned after being organized
@@ -223,18 +218,17 @@ def incogsnoo_listing_parser(soup):
# Adding the url to the list of urls
link = itopic.find("a", {"class": "comments"}).get("href")
link = cleanLink(link)
href.append(link)
# Finding dates
p_tag = itopic.find("p", {"class": "submitted"})
dt = p_tag.find("span")["title"]
date_time_obj = datetime.strptime(dt,'%a, %d %b %Y %H:%M:%S %Z')
sdate = date_time_obj.strftime('%m %d %Y')
stime = date_time_obj.strftime('%I:%M %p')
date = convertDate(sdate, "english", datetime.now()) + " " + stime
# sdate = date_time_obj.strftime('%m %d %Y')
# stime = date_time_obj.strftime('%I:%M %p')
# date = convertDate(sdate, "english", datetime.now()) + " " + stime
# e.g. "12/18/2023 05:49 AM"
addDate.append(date)
addDate.append(date_time_obj)
image_author.append("-1")
@@ -255,7 +249,6 @@ def incogsnoo_links_parser(soup):
listing_parent = soup.find("div", {"id": "links", "class": "sr"})
listing = listing_parent.findAll("div", {"class": "entry"})
count = 0
for entry in listing:
parent_div = entry.find("div", {"class": "meta"}).find("div", {"class", "links"})
@@ -263,9 +256,4 @@ def incogsnoo_links_parser(soup):
if a_tag:
href.append(a_tag.get("href"))
# if count == 10:
# break
count += 1
return href

Loading…
Cancel
Save