diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py index 48649d7..8a3c3e7 100644 --- a/Forums/Altenens/parser.py +++ b/Forums/Altenens/parser.py @@ -1,4 +1,4 @@ -__author__ = 'Helium' +__author__ = 'DarkWeb' # Here, we are importing the auxiliary functions to clean or convert data from Forums.Utilities.utilities import * @@ -11,162 +11,56 @@ from bs4 import BeautifulSoup # This is the method to parse the Description Pages (one page to each topic in the Listing Pages) -def cryptBB_description_parser(soup): - # Fields to be parsed +def altenens_description_parser(soup): - topic = "-1" # topic name - user = [] # all users of each post - addDate = [] # all dated of each post - feedback = [] # all feedbacks of each vendor (this was found in just one Forum and with a number format) - status = [] # all user's authority in each post such as (adm, member, dangerous) - reputation = [] # all user's karma in each post (usually found as a number) - sign = [] # all user's signature in each post (usually a standard message after the content of the post) - post = [] # all messages of each post - interest = [] # all user's interest in each post + topic = "-1" # 0 *topic name + user = [] # 1 *all users of each post + status = [] # 2 all user's authority in each post such as (adm, member, dangerous) + reputation = [] # 3 all user's karma in each post (usually found as a number) + interest = [] # 4 all user's interest in each post + sign = [] # 5 all user's signature in each post (usually a standard message after the content of the post) + post = [] # 6 all messages of each post + feedback = [] # 7 all feedbacks of each vendor (this was found in just one Forum and with a number format) + addDate = [] # 8 all dates of each post - # Finding the topic (should be just one coming from the Listing Page) - - li = soup.find("td", {"class": "thead"}).find('strong') - topic = li.text - topic = re.sub("\[\w*\]", '', topic) - - topic = topic.replace(",","") - topic = topic.replace("\n","") + topic = soup.find("h1", {"class": "p-title-value"}).text topic = cleanString(topic.strip()) - # Finding the repeated tag that corresponds to the listing of posts - - # try: - posts = soup.find('table', {"class": "tborder tfixed clear"}).find('td', {"id": "posts_container"}).find_all( - 'div', {"class": "post"}) - - # For each message (post), get all the fields we are interested to: - - for ipost in posts: - - # Finding a first level of the HTML page - - post_wrapper = ipost.find('span', {"class": "largetext"}) + iposts = soup.find('div', {"class": "block-body js-replyNewMessageContainer"}).find_all('article') - # Finding the author (user) of the post + for ipost in iposts: - author = post_wrapper.text.strip() - user.append(cleanString(author)) # Remember to clean the problematic characters + author = ipost.find('h4', {"class": "message-name"}).text + user.append(cleanString(author.strip())) - # Finding the status of the author + stat = ipost.find('h5', {"class": "userTitle message-userTitle"}).text + status.append(cleanString(stat.strip())) - smalltext = ipost.find('div', {"class": "post_author"}) - - ''' - # Testing here two possibilities to find this status and combine them - if ipost.find('div', {"class": "deleted_post_author"}): - status.append(-1) - interest.append(-1) - reputation.append(-1) - addDate.append(-1) - post.append("THIS POST HAS BEEN REMOVED!") - sign.append(-1) - feedback.append(-1) - continue - ''' - - # CryptBB does have membergroup and postgroup - - membergroup = smalltext.find('div', {"class": "profile-rank"}) - postgroup = smalltext.find('div', {"class": "postgroup"}) - if membergroup != None: - membergroup = membergroup.text.strip() - if postgroup != None: - postgroup = postgroup.text.strip() - membergroup = membergroup + " - " + postgroup - else: - if postgroup != None: - membergroup = postgroup.text.strip() - else: - membergroup = "-1" - status.append(cleanString(membergroup)) - - # Finding the interest of the author - # CryptBB does not have blurb - blurb = smalltext.find('li', {"class": "blurb"}) - if blurb != None: - blurb = blurb.text.strip() + bar = ipost.find('div', {"class": "xtr-progress-bar"}) + if bar is not None: + rep = bar.find('p').get('data-value') else: - blurb = "-1" - interest.append(cleanString(blurb)) - - # Finding the reputation of the user - # CryptBB does have reputation - author_stats = smalltext.find('div', {"class": "author_statistics"}) - karma = author_stats.find('strong') - if karma != None: - karma = karma.text - karma = karma.replace("Community Rating: ", "") - karma = karma.replace("Karma: ", "") - karma = karma.strip() - else: - karma = "-1" - reputation.append(cleanString(karma)) - - # Getting here another good tag to find the post date, post content and users' signature - - postarea = ipost.find('div', {"class": "post_content"}) - - dt = postarea.find('span', {"class": "post_date"}).text - # dt = dt.strip().split() - dt = dt.strip() - day=date.today() - if "Yesterday" in dt: - yesterday = day - timedelta(days=1) - yesterday = yesterday.strftime('%m-%d-%Y') - stime = dt.replace('Yesterday,','').strip() - date_time_obj = yesterday+ ', '+stime - date_time_obj = datetime.strptime(date_time_obj,'%m-%d-%Y, %I:%M %p') - elif "hours ago" in dt: - day = day.strftime('%m-%d-%Y') - date_time_obj = postarea.find('span', {"class": "post_date"}).find('span')['title'] - date_time_obj = datetime.strptime(date_time_obj, '%m-%d-%Y, %I:%M %p') - else: - date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p') - stime = date_time_obj.strftime('%b %d, %Y') - sdate = date_time_obj.strftime('%I:%M %p') - addDate.append(date_time_obj) + rep = "-1" + reputation.append(cleanString(rep)) - # Finding the post + interest.append("-1") - inner = postarea.find('div', {"class": "post_body scaleimages"}) - inner = inner.text.strip() - post.append(cleanString(inner)) - - # Finding the user's signature - - # signature = ipost.find('div', {"class": "post_wrapper"}).find('div', {"class": "moderatorbar"}).find('div', {"class": "signature"}) - signature = ipost.find('div', {"class": "signature scaleimages"}) - if signature != None: + signature = ipost.find('aside', {"class": "message-signature"}) + if signature is not None: signature = signature.text.strip() - # print(signature) else: signature = "-1" sign.append(cleanString(signature)) - # As no information about user's feedback was found, just assign "-1" to the variable + inner = ipost.find('div', {"class": "bbWrapper"}).find(text=True, recursive=False) + post.append(cleanString(inner.strip())) feedback.append("-1") - ''' - except: - if soup.find('td', {"class": "trow1"}).text == " You do not have permission to access this page. ": - user.append("-1") - status.append(-1) - interest.append(-1) - reputation.append(-1) - addDate.append(-1) - post.append("NO ACCESS TO THIS PAGE!") - sign.append(-1) - feedback.append(-1) - ''' - + dt = ipost.find('time', {"class": "u-dt"}) + date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p') + addDate.append(date_time_obj) # Populate the final variable (this should be a list with all fields scraped) @@ -178,74 +72,55 @@ def cryptBB_description_parser(soup): # This is the method to parse the Listing Pages (one page with many posts) -def cryptBB_listing_parser(soup): - - board = "-1" # board name (the previous level of the topic in the Forum categorization tree. - # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) - - nm = 0 # this variable should receive the number of topics - topic = [] # all topics - author = [] # all authors of each topic - views = [] # number of views of each topic - posts = [] # number of posts of each topic - addDate = [] # when the topic was created (difficult to find) - href = [] # this variable should receive all cleaned urls (we will use this to do the marge between - # Listing and Description pages) - - # Finding the board (should be just one) - - board = soup.find('span', {"class": "active"}).text +def altenens_listing_parser(soup): + + nm = 0 # *this variable should receive the number of topics + forum = "Altenens" # 0 *forum name + board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. + # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) + author = [] # 2 *all authors of each topic + topic = [] # 3 *all topics + views = [] # 4 number of views of each topic + posts = [] # 5 number of posts of each topic + href = [] # 6 this variable should receive all cleaned urls (we will use this to do the marge between + # Listing and Description pages) + addDate = [] # 7 when the topic was created (difficult to find) + + board = soup.find('h1', {"class": "p-title-value"}).text board = cleanString(board.strip()) - # Finding the repeated tag that corresponds to the listing of topics + itopics = soup.find_all('div', {"class": "structItem-cell structItem-cell--main"}) - itopics = soup.find_all('tr',{"class": "inline_row"}) + nm = len(itopics) for itopic in itopics: - # For each topic found, the structure to get the rest of the information can be of two types. Testing all of them - # to don't miss any topic - - # Adding the topic to the topic list - try: - topics = itopic.find('span', {"class": "subject_old"}).find('a').text - except: - topics = itopic.find('span', {"class": "subject_new"}).find('a').text - topics = re.sub("\[\w*\]", '', topics) - topic.append(cleanString(topics)) - - # Counting how many topics we have found so far + topics = itopic.find('div', {"class": "structItem-title"}).text + topic.append(cleanString(topics.strip())) - nm = len(topic) - - # Adding the url to the list of urls - try: - link = itopic.find('span', {"class": "subject_old"}).find('a').get('href') - except: - link = itopic.find('span',{"class": "subject_new"}).find('a').get('href') - link = cleanLink(link) + link = itopic.find('a').get('href') href.append(link) - # Finding the author of the topic - ps = itopic.find('div', {"class":"author smalltext"}).find('a').text - user = ps.strip() - author.append(cleanString(user)) + user = itopic.find('div', {"class": "structItem-parts"}).find('a').text + author.append(cleanString(user.strip())) - # Finding the number of replies - columns = itopic.findChildren('td',recursive=False) - replies = columns[3].text + dt = itopic.find('li', {"class": "structItem-startDate"}).get('datetime') + date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p') + addDate.append(date_time_obj) - posts.append(cleanString(replies)) + itopics = soup.find_all('div', {"class": "structItem-cell structItem-cell--meta"}) - # Finding the number of Views - tview = columns[4].text - views.append(cleanString(tview)) + for itopic in itopics: - # If no information about when the topic was added, just assign "-1" to the variable + nposts = itopic.find('dl', {"class": "pairs pairs--justified"}).text + nposts = nposts.replace('K', '000') + posts.append(cleanString(nposts)) - addDate.append("-1") + nviews = itopic.find('dl', {"class": "pairs pairs--justified structItem-minor"}).text + nviews = nviews.replace('K', '000') + views.append(cleanString(nviews)) - return organizeTopics("CryptBB", nm, topic, board, author, views, posts, href, addDate) + return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) def altenens_links_parser(soup): diff --git a/Forums/CryptBB/parser.py b/Forums/CryptBB/parser.py index 1265299..7fbd56d 100644 --- a/Forums/CryptBB/parser.py +++ b/Forums/CryptBB/parser.py @@ -168,7 +168,7 @@ def cryptBB_description_parser(soup): def cryptBB_listing_parser(soup): nm = 0 # *this variable should receive the number of topics - forum = "OnniForums" # 0 *forum name + forum = "CryptBB" # 0 *forum name board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. # For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) author = [] # 2 *all authors of each topic