|
@@ -12,150 +12,92 @@ from bs4 import BeautifulSoup


 # This is the method to parse the Description Pages (one page for each topic in the Listing Pages)
-def cryptBB_description_parser(soup):
+def libre_description_parser(soup):

     # Fields to be parsed
     topic = "-1"     # 0 *topic name
     user = []        # 1 *all users of each post
     status = []      # 2 all users' authority in each post, such as (adm, member, dangerous)
     reputation = []  # 3 all users' karma in each post (usually found as a number)
     interest = []    # 4 all users' interests in each post
     sign = []        # 5 all users' signatures in each post (usually a standard message after the post content)
     post = []        # 6 all messages of each post
     feedback = []    # 7 all feedback on each vendor (found in just one forum, in number format)
     addDate = []     # 8 all dates of each post
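
+    # Note: these lists are index-aligned; element i of each list describes post i on the
+    # page, and they are combined into a single row at the end of this function.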

     # Finding the topic (should be just one coming from the Listing Page)
-    li = soup.find("td", {"class": "thead"}).find('strong')
-    topic = li.text
-    topic = re.sub("\[\w*\]", '', topic)
-    topic = topic.replace(",", "")
-    topic = topic.replace("\n", "")
-    topic = cleanString(topic.strip())
+    topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text
+    topic = cleanString(topic_found.strip())

     # Finding the repeated tag that corresponds to the listing of posts
-    posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
-        'div', {"class": "post"})
+    posts = soup.find_all("div", {"class": "flex items-stretch"})
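
+    # Each "flex items-stretch" div presumably wraps exactly one post; the loop below walks
+    # them one at a time and appends one value per field list per post.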

     # For each message (post), get all the fields we are interested in:
     for ipost in posts:

-        # Finding a first level of the HTML page
-        post_wrapper = ipost.find('span', {"class": "largetext"})

         # Finding the author (user) of the post
-        author = post_wrapper.text.strip()
-        user.append(cleanString(author))  # Remember to clean the problematic characters
+        user_name = ipost.find("a", {"class": "link"}).text
+        user_name_cleaned = user_name.split("/")[1]
+        user.append(cleanString(user_name_cleaned))  # Remember to clean the problematic characters

-        # Finding the status of the author
-        smalltext = ipost.find('div', {"class": "post_author"})
-
-        '''
-        # Testing here two possibilities to find this status and combine them
-        if ipost.find('div', {"class": "deleted_post_author"}):
-            status.append(-1)
-            interest.append(-1)
-            reputation.append(-1)
-            addDate.append(-1)
-            post.append("THIS POST HAS BEEN REMOVED!")
-            sign.append(-1)
-            feedback.append(-1)
-            continue
-        '''
-
-        # CryptBB does have membergroup and postgroup
-        membergroup = smalltext.find('div', {"class": "profile-rank"})
-        postgroup = smalltext.find('div', {"class": "postgroup"})
-        if membergroup != None:
-            membergroup = membergroup.text.strip()
-            if postgroup != None:
-                postgroup = postgroup.text.strip()
-                membergroup = membergroup + " - " + postgroup
-        else:
-            if postgroup != None:
-                membergroup = postgroup.text.strip()
-            else:
-                membergroup = "-1"
-        status.append(cleanString(membergroup))
+        status.append("-1")
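
+        # Assumption about Libre's markup: the author link text is path-like (e.g. a
+        # hypothetical "/alice"), so split("/")[1] keeps the name after the leading slash.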

         # Finding the interest of the author
         # CryptBB does not have blurb
-        blurb = smalltext.find('li', {"class": "blurb"})
-        if blurb != None:
-            blurb = blurb.text.strip()
-        else:
-            blurb = "-1"
-        interest.append(cleanString(blurb))
+        interest.append("-1")

         # Finding the reputation of the user
         # CryptBB does have reputation
-        author_stats = smalltext.find('div', {"class": "author_statistics"})
-        karma = author_stats.find('strong')
-        if karma != None:
-            karma = karma.text
-            karma = karma.replace("Community Rating: ", "")
-            karma = karma.replace("Karma: ", "")
-            karma = karma.strip()
-        else:
-            karma = "-1"
-        reputation.append(cleanString(karma))
+        karma = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
+        karma_cleaned = karma.split(" ")[6]
+        reputation.append(cleanString(karma_cleaned.strip()))
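
+        # Assumption: the byline paragraph carries the karma value as its 7th
+        # whitespace-separated token (split(" ")[6]); if Libre rewords that sentence,
+        # this index will silently pick up the wrong word.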

         # Getting here another good tag to find the post date, post content and users' signature
-        postarea = ipost.find('div', {"class": "post_content"})
-        dt = postarea.find('span', {"class": "post_date"}).text
-        dt = dt.strip()
-        day = date.today()
-        if "Yesterday" in dt:
-            yesterday = day - timedelta(days=1)
-            yesterday = yesterday.strftime('%m-%d-%Y')
-            stime = dt.replace('Yesterday,', '').strip()
-            date_time_obj = yesterday + ', ' + stime
-            date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
-        elif "hours ago" in dt:
-            day = day.strftime('%m-%d-%Y')
-            date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
-            date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
-        else:
-            date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
-        stime = date_time_obj.strftime('%b %d, %Y')
-        sdate = date_time_obj.strftime('%I:%M %p')
-        addDate.append(date_time_obj)
+        date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
+        date_time_cleaned = date_posted.replace(user_name, "")[3:-12]
+        datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT")
+        addDate.append(datetime_append)
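
+        # Assumption: with the username removed, the byline presumably carries a 3-character
+        # prefix and a 12-character tail around a "2023-01-02 03:04:05 GMT"-style timestamp,
+        # so [3:-12] isolates exactly the "%Y-%m-%d %H:%M:%S GMT" portion strptime expects.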

         # Finding the post
-        inner = postarea.find('div', {"class": "post_body scaleimages"})
-        inner = inner.text.strip()
-        post.append(cleanString(inner))
+        user_post = ipost.find("div", {"class": "content-c"}).text
+        post.append(cleanString(user_post))
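
+        # "content-c" presumably wraps the rendered message body; .text flattens any nested
+        # markup (quotes, links, code) into plain text before cleaning.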

         # Finding the user's signature
-        # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
-        signature = ipost.find('div', {"class": "signature scaleimages"})
-        if signature != None:
-            signature = signature.text.strip()
-        else:
-            signature = "-1"
-        sign.append(cleanString(signature))
+        sign.append("-1")

         # As no information about the user's feedback was found, just assign "-1" to the variable
         feedback.append("-1")

     # Populate the final variable (this should be a list with all fields scraped)
     row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
|
@@ -163,76 +105,94 @@ def cryptBB_description_parser(soup):

     return row

 # This is the method to parse the Listing Pages (one page with many posts)
-def cryptBB_listing_parser(soup):
+def libre_listing_parser(soup):

     nm = 0        # *this variable should receive the number of topics
-    forum = "OnniForums"  # 0 *forum name
+    forum = "Libre"  # 0 *forum name
     board = "-1"  # 1 *board name (the level right above the topic in the forum categorization tree.
     # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
     author = []   # 2 *all authors of each topic
     topic = []    # 3 *all topics
     views = []    # 4 number of views of each topic
     posts = []    # 5 number of posts of each topic
     href = []     # 6 this variable should receive all cleaned urls (we will use this to merge the Listing and Description pages)
     addDate = []  # 7 when the topic was created (difficult to find)

     # Finding the board (should be just one)
-    board = soup.find('span', {"class": "active"}).text
+    board = soup.find('div', {"class": "title"}).find("h1").text
     board = cleanString(board.strip())

     # Finding the repeated tag that corresponds to the listing of topics
-    itopics = soup.find_all('tr', {"class": "inline_row"})
+    itopics = soup.find("div", {"class": "space-y-2 mt-4"}).find_all('div', {"class": "flex box"})

+    nm = 0
     for itopic in itopics:
+        nm += 1
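
+        # Each "flex box" div presumably corresponds to one topic row, so after the loop
+        # nm holds the number of topics found on this listing page.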

         # For each topic found, the structure holding the rest of the information can be of
         # two types, so both are tested in order not to miss any topic

         # Adding the topic to the topic list
-        try:
-            topics = itopic.find('span', {"class": "subject_old"}).find('a').text
-        except:
-            topics = itopic.find('span', {"class": "subject_new"}).find('a').text
-        topics = re.sub("\[\w*\]", '', topics)
-        topic.append(cleanString(topics))
-
-        # Counting how many topics we have found so far
-        nm = len(topic)
+        topic_string = itopic.find("a", {"class": "link text-xl text-zinc-300"}).text
+        cleaned_topic_string = cleanString(topic_string.strip())
+        topic.append(cleaned_topic_string)

         # Adding the url to the list of urls
-        try:
-            link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
-        except:
-            link = itopic.find('span', {"class": "subject_new"}).find('a').get('href')
-        href.append(link)
+        link_to_clean = itopic.find("a", {"class": "link text-xl text-zinc-300"}).get("href")
+        href.append(link_to_clean)

         # Finding the author of the topic
-        ps = itopic.find('div', {"class": "author smalltext"}).find('a').text
-        user = ps.strip()
-        author.append(cleanString(user))
+        username_not_cleaned = itopic.find('div', {"class": "flex-grow p-2 text-justify"}).find('a').text
+        username_cleaned = username_not_cleaned.split("/")[-1]
+        author.append(cleanString(username_cleaned))

         # Finding the number of views
-        columns = itopic.findChildren('td', recursive=False)
-        tview = columns[4].text
-        views.append(cleanString(tview))
+        num_views = itopic.find_all("div", {"class": "flex items-center"})[0].find("p").text
+        views.append(cleanString(num_views))

         # Finding the number of replies
-        replies = columns[3].text
-        posts.append(cleanString(replies))
+        num_replies = itopic.find_all("div", {"class": "flex items-center"})[1].find("p").text
+        posts.append(cleanString(num_replies))
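
+        # Positional assumptions: split("/")[-1] keeps the last segment of the author link
+        # text (e.g. a hypothetical "/u/alice" -> "alice"), and of the "flex items-center"
+        # divs the first presumably holds the view count and the second the reply count.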

-        # If no information about when the topic was added, just assign "-1" to the variable
-        addDate.append("-1")
+        date_time_concatenated = itopic.find("p", {"class": "text-sm text-zinc-400 italic"}).text
+        date_time_cleaned = date_time_concatenated.replace(username_not_cleaned, "")
+        # creating the datetime object
+        date_time_array = date_time_cleaned[3:]
+        datetime_append = datetime.strptime(date_time_array, "%Y-%m-%d %H:%M:%S GMT")
+        addDate.append(datetime_append)
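
+        # Assumption: the italic byline presumably reads like "by <user> 2023-01-02 03:04:05 GMT",
+        # so after removing the username, [3:] strips the short leading fragment and leaves a
+        # "%Y-%m-%d %H:%M:%S GMT" timestamp for strptime.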

-    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
+    return organizeTopics(
+        forum=forum,
+        nm=nm,
+        board=board,
+        author=author,
+        topic=topic,
+        views=views,
+        posts=posts,
+        href=href,
+        addDate=addDate
+    )
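
+    # Design note: calling organizeTopics with keyword arguments makes the field order
+    # explicit, so a change in the helper's parameter order can no longer silently
+    # scramble the scraped columns.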


 def libre_links_parser(soup):

     # Returning all links that should be visited by the Crawler