From 6e1a1b6b2a35d40f0fb954826b88bfe407ccf493 Mon Sep 17 00:00:00 2001
From: Khoi
Date: Thu, 20 Jul 2023 14:03:59 -0700
Subject: [PATCH] completed Libre forum (not tested yet)

---
 Forums/Initialization/forumsList.txt    |   2 +-
 Forums/Initialization/prepare_parser.py |   5 +
 Forums/Libre/crawler_selenium.py        |  18 +-
 Forums/Libre/parser.py                  | 264 ++++++++++--------------
 4 files changed, 127 insertions(+), 162 deletions(-)

diff --git a/Forums/Initialization/forumsList.txt b/Forums/Initialization/forumsList.txt
index 6f635a1..3010d1b 100644
--- a/Forums/Initialization/forumsList.txt
+++ b/Forums/Initialization/forumsList.txt
@@ -1 +1 @@
-Cardingleaks
\ No newline at end of file
+Libre
\ No newline at end of file

diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index 1c624b3..a1ef429 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -11,6 +11,7 @@ from Forums.CryptBB.parser import *
 from Forums.OnniForums.parser import *
 from Forums.Altenens.parser import *
 from Forums.Procrax.parser import *
+from Forums.Libre.parser import *
 from Forums.Classifier.classify_product import predict
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi

@@ -160,6 +161,8 @@ def new_parse(forum, url, createLog):
             rmm = altenens_description_parser(soup)
         elif forum == "Procrax":
             rmm = procrax_description_parser(soup)
+        elif forum == "Libre":
+            rmm = libre_description_parser(soup)

         # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
         key = u"Url:" + os.path.basename(line2).replace(".html", "")
@@ -243,6 +246,8 @@ def new_parse(forum, url, createLog):
                 rw = altenens_listing_parser(soup)
             elif forum == "Procrax":
                 rw = procrax_listing_parser(soup)
+            elif forum == "Libre":
+                rw = libre_listing_parser(soup)

         except:

diff --git a/Forums/Libre/crawler_selenium.py b/Forums/Libre/crawler_selenium.py
index dfef8db..a2ba332 100644
--- a/Forums/Libre/crawler_selenium.py
+++ b/Forums/Libre/crawler_selenium.py
@@ -28,17 +28,17 @@ baseURL = 'http://libreeunomyly6ot7kspglmbd5cvlkogib6rozy43r2glatc6rmwauqd.onion

 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
+    # opentor()
     forumName = getForumName()
-    driver = getAccess()
+    # driver = getAccess()

-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)

     new_parse(forumName, baseURL, False)
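
The two new elif branches wire Libre into new_parse()'s existing per-forum dispatch, and startCrawling() has the Tor crawl temporarily commented out, so running it exercises only the parsing stage (consistent with the "not tested yet" subject line). If the dispatch chains keep growing, a name-to-parser registry keeps them flat; a minimal sketch using the parser names from this patch (the registry and helper are illustrative, not existing code in the repo):

    from Forums.Libre.parser import libre_description_parser, libre_listing_parser

    # Illustrative registry: forum name -> (description parser, listing parser).
    PARSER_REGISTRY = {
        "Libre": (libre_description_parser, libre_listing_parser),
        # the other forums would be registered the same way
    }

    def get_parsers(forum):
        # Fail loudly on an unknown forum instead of silently skipping it.
        if forum not in PARSER_REGISTRY:
            raise ValueError("no parsers registered for forum: " + forum)
        return PARSER_REGISTRY[forum]

new_parse() would then unpack the pair once instead of walking two elif chains.
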
diff --git a/Forums/Libre/parser.py b/Forums/Libre/parser.py
index 093c671..1991d7a 100644
--- a/Forums/Libre/parser.py
+++ b/Forums/Libre/parser.py
@@ -12,150 +12,92 @@ from bs4 import BeautifulSoup


 # This is the method to parse the Description Pages (one page to each topic in the Listing Pages)
-def cryptBB_description_parser(soup):
-
+def libre_description_parser(soup):
     # Fields to be parsed

-    topic = "-1"                            # 0 *topic name
-    user = []                               # 1 *all users of each post
-    status = []                             # 2 all user's authority in each post such as (adm, member, dangerous)
-    reputation = []                         # 3 all user's karma in each post (usually found as a number)
-    interest = []                           # 4 all user's interest in each post
-    sign = []                               # 5 all user's signature in each post (usually a standard message after the content of the post)
-    post = []                               # 6 all messages of each post
-    feedback = []                           # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
-    addDate = []                            # 8 all dates of each post
+    topic = "-1"  # 0 *topic name
+    user = []  # 1 *all users of each post
+    status = []  # 2 all user's authority in each post such as (adm, member, dangerous)
+    reputation = []  # 3 all user's karma in each post (usually found as a number)
+    interest = []  # 4 all user's interest in each post
+    sign = []  # 5 all user's signature in each post (usually a standard message after the content of the post)
+    post = []  # 6 all messages of each post
+    feedback = []  # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format)
+    addDate = []  # 8 all dates of each post

     # Finding the topic (should be just one coming from the Listing Page)
-    li = soup.find("td", {"class": "thead"}).find('strong')
-    topic = li.text
-    topic = re.sub("\[\w*\]", '', topic)
-
-    topic = topic.replace(",","")
-    topic = topic.replace("\n","")
-    topic = cleanString(topic.strip())
+    topic_found = soup.find("a", {"class": "link text-xl text-zinc-300"}).text
+    topic = cleanString(topic_found.strip())

     # Finding the repeated tag that corresponds to the listing of posts

     # try:
-    posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all(
-        'div', {"class": "post"})
+    posts = soup.find_all("div", {"class": "flex items-stretch"})

     # For each message (post), get all the fields we are interested to:

     for ipost in posts:
-        # Finding a first level of the HTML page
-        post_wrapper = ipost.find('span', {"class": "largetext"})
-        # Finding the author (user) of the post
-        author = post_wrapper.text.strip()
-        user.append(cleanString(author))  # Remember to clean the problematic characters
-
-        # Finding the status of the author
-
-        smalltext = ipost.find('div', {"class": "post_author"})
-
-        '''
-        # Testing here two possibilities to find this status and combine them
-        if ipost.find('div', {"class": "deleted_post_author"}):
-            status.append(-1)
-            interest.append(-1)
-            reputation.append(-1)
-            addDate.append(-1)
-            post.append("THIS POST HAS BEEN REMOVED!")
-            sign.append(-1)
-            feedback.append(-1)
-            continue
-        '''
-
-        # CryptBB does have membergroup and postgroup
-
-        membergroup = smalltext.find('div', {"class": "profile-rank"})
-        postgroup = smalltext.find('div', {"class": "postgroup"})
-        if membergroup != None:
-            membergroup = membergroup.text.strip()
-            if postgroup != None:
-                postgroup = postgroup.text.strip()
-                membergroup = membergroup + " - " + postgroup
-        else:
-            if postgroup != None:
-                membergroup = postgroup.text.strip()
-            else:
-                membergroup = "-1"
-        status.append(cleanString(membergroup))
+        user_name = ipost.find("a", {"class": "link"}).text
+        user_name_cleaned = user_name.split("/")[1]
+        user.append(cleanString(user_name_cleaned))  # Remember to clean the problematic characters
+
+        status.append("-1")

         # Finding the interest of the author
         # CryptBB does not have blurb
-        blurb = smalltext.find('li', {"class": "blurb"})
-        if blurb != None:
-            blurb = blurb.text.strip()
-        else:
-            blurb = "-1"
-        interest.append(cleanString(blurb))
+
+        interest.append("-1")

         # Finding the reputation of the user
         # CryptBB does have reputation
-        author_stats = smalltext.find('div', {"class": "author_statistics"})
-        karma = author_stats.find('strong')
-        if karma != None:
-            karma = karma.text
-            karma = karma.replace("Community Rating: ", "")
-            karma = karma.replace("Karma: ", "")
-            karma = karma.strip()
-        else:
-            karma = "-1"
-        reputation.append(cleanString(karma))
+
+        karma = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
+        karma_cleaned = karma.split(" ")[6]
+        reputation.append(cleanString(karma_cleaned.strip()))
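
The reputation value above is taken by fixed word position (split(" ")[6]) from the same byline paragraph that also carries the poster's name and date, so any wording change on Libre's side silently shifts the index; note too that the username is taken with split("/")[1] here but with split("/")[-1] in the listing parser below, which is worth confirming against a saved page. A more defensive sketch, assuming only that the byline contains a fragment like "karma: 123" (an unverified assumption about the page text, not something this patch establishes):

    import re

    def extract_karma(byline_text):
        # Look for "karma: <number>" anywhere in the byline instead of a fixed
        # word offset; "-1" mirrors the missing-value convention used elsewhere.
        match = re.search(r"karma:\s*(\d+)", byline_text, re.IGNORECASE)
        return match.group(1) if match else "-1"
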

         # Getting here another good tag to find the post date, post content and users' signature

-        postarea = ipost.find('div', {"class": "post_content"})
-
-        dt = postarea.find('span', {"class": "post_date"}).text
-        # dt = dt.strip().split()
-        dt = dt.strip()
-        day=date.today()
-        if "Yesterday" in dt:
-            yesterday = day - timedelta(days=1)
-            yesterday = yesterday.strftime('%m-%d-%Y')
-            stime = dt.replace('Yesterday,','').strip()
-            date_time_obj = yesterday+ ', '+stime
-            date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p')
-        elif "hours ago" in dt:
-            day = day.strftime('%m-%d-%Y')
-            date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title']
-            date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p')
-        else:
-            date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
-        stime = date_time_obj.strftime('%b %d, %Y')
-        sdate = date_time_obj.strftime('%I:%M %p')
-        addDate.append(date_time_obj)
+        date_posted = ipost.find("p", {"class": "text-zinc-400 text-justify"}).text
+        date_time_cleaned = date_posted.replace(user_name, "")[3:-12]
+        datetime_append = datetime.strptime(date_time_cleaned, "%Y-%m-%d %H:%M:%S GMT")
+        addDate.append(datetime_append)

         # Finding the post
-
-        inner = postarea.find('div', {"class": "post_body scaleimages"})
-        inner = inner.text.strip()
-        post.append(cleanString(inner))
+        user_post = ipost.find("div", {"class": "content-c"}).text
+        post.append(cleanString(user_post))

         # Finding the user's signature

         # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"})
-        signature = ipost.find('div', {"class": "signature scaleimages"})
-        if signature != None:
-            signature = signature.text.strip()
-            # print(signature)
-        else:
-            signature = "-1"
-        sign.append(cleanString(signature))
+
+        sign.append("-1")

         # As no information about user's feedback was found, just assign "-1" to the variable

         feedback.append("-1")

     # Populate the final variable (this should be a list with all fields scraped)
+    # print(topic)
+    # print(user)
+    # print(status)
+    # print(reputation)
+    # print(interest)
+    # print(sign)
+    # print(post)
+    # print(feedback)
+    # print(addDate)
+    # print(len(user))
+    # print(len(status))
+    # print(len(reputation))
+    # print(len(interest))
+    # print(len(sign))
+    # print(len(feedback))
+    # print(len(addDate))

     row = (topic, user, status, reputation, interest, sign, post, feedback, addDate)
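
Both parsers build datetime objects with the format string "%Y-%m-%d %H:%M:%S GMT", in which strptime matches the trailing "GMT" as literal text rather than as a timezone. A self-contained check with an illustrative value (not taken from a live page):

    from datetime import datetime

    raw = "2023-07-20 14:03:59 GMT"  # illustrative timestamp
    parsed = datetime.strptime(raw, "%Y-%m-%d %H:%M:%S GMT")
    assert parsed == datetime(2023, 7, 20, 14, 3, 59)
    assert parsed.tzinfo is None  # "GMT" matched literally, so the result is naive

If the hard-coded [3:-12] slice above ever misaligns, strptime raises ValueError, so wrapping these calls in try/except would keep one malformed post from aborting the whole page.
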
@@ -163,76 +105,94 @@ def cryptBB_description_parser(soup):

     return row


-# This is the method to parse the Listing Pages (one page with many posts)
-def cryptBB_listing_parser(soup):
-
-    nm = 0                                  # *this variable should receive the number of topics
-    forum = "OnniForums"                    # 0 *forum name
-    board = "-1"                            # 1 *board name (the previous level of the topic in the Forum categorization tree.
-                                            # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
-    author = []                             # 2 *all authors of each topic
-    topic = []                              # 3 *all topics
-    views = []                              # 4 number of views of each topic
-    posts = []                              # 5 number of posts of each topic
-    href = []                               # 6 this variable should receive all cleaned urls (we will use this to do the marge between
-                                            # Listing and Description pages)
-    addDate = []                            # 7 when the topic was created (difficult to find)
+# This is the method to parse the Listing Pages (one page with many posts)
+def libre_listing_parser(soup):
+    nm = 0  # *this variable should receive the number of topics
+    forum = "Libre"  # 0 *forum name
+    board = "-1"  # 1 *board name (the previous level of the topic in the Forum categorization tree.
+    # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware)
+    author = []  # 2 *all authors of each topic
+    topic = []  # 3 *all topics
+    views = []  # 4 number of views of each topic
+    posts = []  # 5 number of posts of each topic
+    href = []  # 6 this variable should receive all cleaned urls (we will use this to do the merge between
+    # Listing and Description pages)
+    addDate = []  # 7 when the topic was created (difficult to find)

     # Finding the board (should be just one)
-    board = soup.find('span', {"class": "active"}).text
+    board = soup.find('div', {"class": "title"}).find("h1").text
     board = cleanString(board.strip())

     # Finding the repeated tag that corresponds to the listing of topics
-    itopics = soup.find_all('tr',{"class": "inline_row"})
+    itopics = soup.find("div", {"class": "space-y-2 mt-4"}).find_all('div', {"class": "flex box"})
+    nm = 0

     for itopic in itopics:
-
+        nm += 1
         # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them
         # to don't miss any topic

         # Adding the topic to the topic list
-        try:
-            topics = itopic.find('span', {"class": "subject_old"}).find('a').text
-        except:
-            topics = itopic.find('span', {"class": "subject_new"}).find('a').text
-        topics = re.sub("\[\w*\]", '', topics)
-        topic.append(cleanString(topics))
-
-        # Counting how many topics we have found so far
-
-        nm = len(topic)
+        topic_string = itopic.find("a", {"class": "link text-xl text-zinc-300"}).text
+        cleaned_topic_string = cleanString(topic_string.strip())
+        topic.append(cleaned_topic_string)

         # Adding the url to the list of urls
-        try:
-            link = itopic.find('span', {"class": "subject_old"}).find('a').get('href')
-        except:
-            link = itopic.find('span',{"class": "subject_new"}).find('a').get('href')
-        href.append(link)
+        link_to_clean = itopic.find("a", {"class": "link text-xl text-zinc-300"}).get("href")

-        # Finding the author of the topic
-        ps = itopic.find('div', {"class":"author smalltext"}).find('a').text
-        user = ps.strip()
-        author.append(cleanString(user))
+        href.append(link_to_clean)

-        # Finding the number of replies
-        columns = itopic.findChildren('td',recursive=False)
-        replies = columns[3].text
+        # Finding the author of the topic
+        username_not_cleaned = itopic.find('div', {"class": "flex-grow p-2 text-justify"}).find('a').text
+        username_cleaned = username_not_cleaned.split("/")[-1]
+        author.append(cleanString(username_cleaned))

-        posts.append(cleanString(replies))
+        # Finding the number of views
+        num_views = itopic.find_all("div", {"class": "flex items-center"})[0].find("p").text
+        views.append(cleanString(num_views))

-        # Finding the number of Views
-        tview = columns[4].text
-        views.append(cleanString(tview))
+        # Finding the number of replies
+        num_replies = itopic.find_all("div", {"class": "flex items-center"})[1].find("p").text
+        posts.append(cleanString(num_replies))
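
Views and replies are read positionally from the first and second "flex items-center" cells of each row, which encodes an assumption about Libre's column order. A guarded variant of the same lookup (same class names as the patch; the views-then-replies ordering is the patch's assumption, not verified here):

    def topic_stats(itopic):
        # The listing row is assumed to render views first, replies second.
        cells = itopic.find_all("div", {"class": "flex items-center"})
        if len(cells) < 2:
            raise ValueError("expected at least two stat cells (views, replies)")
        return cells[0].find("p").text, cells[1].find("p").text
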
         # If no information about when the topic was added, just assign "-1" to the variable
-        addDate.append("-1")
-
-    return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate)
-
+        date_time_concatenated = itopic.find("p", {"class": "text-sm text-zinc-400 italic"}).text
+        date_time_cleaned = date_time_concatenated.replace(username_not_cleaned, "")
+        # creating the datetime object
+        date_time_array = date_time_cleaned[3:]
+        datetime_append = datetime.strptime(date_time_array, "%Y-%m-%d %H:%M:%S GMT")
+        addDate.append(datetime_append)
+
+    # print(forum)
+    # print(nm)
+    # print(board)
+    # print(author)
+    # print(topic)
+    # print(views)
+    # print(href)
+    # print(addDate)
+    # print(len(author))
+    # print(len(topic))
+    # print(len(views))
+    # print(len(href))
+    # print(len(addDate))
+
+    return organizeTopics(
+        forum=forum,
+        nm=nm,
+        board=board,
+        author=author,
+        topic=topic,
+        views=views,
+        posts=posts,
+        href=href,
+        addDate=addDate
+    )


 def libre_links_parser(soup):
     # Returning all links that should be visited by the Crawler
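
The patch text ends here, before the body of libre_links_parser. For orientation only, a sketch of what this crawler-facing helper typically returns in this codebase, reusing the topic-link class from libre_listing_parser above (an assumed selector; this is not the code from the actual commit):

    def libre_links_parser_sketch(soup):
        # Collect the href of every topic link on a listing page so the
        # crawler knows which description pages to visit next.
        links = []
        for anchor in soup.find_all("a", {"class": "link text-xl text-zinc-300"}):
            link = anchor.get("href")
            if link:
                links.append(link)
        return links
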