|
@ -152,7 +152,7 @@ def bestcardingworld_description_parser(soup): |
|
|
|
|
|
|
|
|
# Populate the final variable (this should be a list with all fields scraped) |
|
|
# Populate the final variable (this should be a list with all fields scraped) |
|
|
|
|
|
|
|
|
row = (topic, post, user, addDate, feedback, status, reputation, sign, interest) |
|
|
|
|
|
|
|
|
row = (topic, user, status, reputation, interest, sign, post, feedback, addDate) |
|
|
|
|
|
|
|
|
# Sending the results |
|
|
# Sending the results |
|
|
|
|
|
|
|
@ -166,15 +166,17 @@ def bestcardingworld_description_parser(soup): |
|
|
#return: 'row' that contains a variety of lists that each hold info on the listing page |
|
|
#return: 'row' that contains a variety of lists that each hold info on the listing page |
|
|
def bestcardingworld_listing_parser(soup): |
|
|
def bestcardingworld_listing_parser(soup): |
|
|
|
|
|
|
|
|
nm = 0 # this variable should receive the number of topics |
|
|
|
|
|
topic = [] # 1 all topics |
|
|
|
|
|
board = "-1" # 2 board name (the previous level of the topic in the Forum categorization tree. |
|
|
|
|
|
|
|
|
nm = 0 # *this variable should receive the number of topics |
|
|
|
|
|
forum = "BestCardingWorld" # 0 *forum name |
|
|
|
|
|
board = "-1" # 1 *board name (the previous level of the topic in the Forum categorization tree. |
|
|
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) |
|
|
# For instance: Security/Malware/Tools to hack Facebook. The board here should be Malware) |
|
|
view = [] # 3 number of views of each topic |
|
|
|
|
|
post = [] # 4 number of posts of each topic |
|
|
|
|
|
user = [] # 5 all users of each topic |
|
|
|
|
|
addDate = [] # 6 when the topic was created (difficult to find) |
|
|
|
|
|
href = [] # 16 this variable should receive all cleaned urls (we will use this to do the merge between Listing and Description pages) |
|
|
|
|
|
|
|
|
author = [] # 2 *all authors of each topic |
|
|
|
|
|
topic = [] # 3 *all topics |
|
|
|
|
|
views = [] # 4 number of views of each topic |
|
|
|
|
|
posts = [] # 5 number of posts of each topic |
|
|
|
|
|
href = [] # 6 this variable should receive all cleaned urls (we will use this to do the merge between |
|
|
|
|
|
# Listing and Description pages) |
|
|
|
|
|
addDate = [] # 7 when the topic was created (difficult to find) |
|
|
|
|
|
|
|
|
# Finding the board (should be just one) |
|
|
# Finding the board (should be just one) |
|
|
|
|
|
|
|
@ -187,7 +189,12 @@ def bestcardingworld_listing_parser(soup): |
|
|
|
|
|
|
|
|
itopics = soup.find('ul', {"class": "topiclist topics"}).findAll('div',{"class": "list-inner"}) |
|
|
itopics = soup.find('ul', {"class": "topiclist topics"}).findAll('div',{"class": "list-inner"}) |
|
|
replies = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "posts"}) |
|
|
replies = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "posts"}) |
|
|
views = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "views"}) |
|
|
|
|
|
|
|
|
view = soup.find('ul', {"class": "topiclist topics"}).findAll('dd',{"class": "views"}) |
|
|
|
|
|
|
|
|
|
|
|
# Counting how many topics we have found so far |
|
|
|
|
|
|
|
|
|
|
|
nm = len(itopics) |
|
|
|
|
|
|
|
|
index = 0 |
|
|
index = 0 |
|
|
for itopic in itopics: |
|
|
for itopic in itopics: |
|
|
|
|
|
|
|
@ -213,10 +220,6 @@ def bestcardingworld_listing_parser(soup): |
|
|
topics = itopic.find('a', {"class": "topictitle"}).text |
|
|
topics = itopic.find('a', {"class": "topictitle"}).text |
|
|
topic.append(cleanString(topics)) |
|
|
topic.append(cleanString(topics)) |
|
|
|
|
|
|
|
|
# Counting how many topics we have found so far |
|
|
|
|
|
|
|
|
|
|
|
nm = len(topic) |
|
|
|
|
|
|
|
|
|
|
|
# Adding the url to the list of urls |
|
|
# Adding the url to the list of urls |
|
|
link = itopic.find('a', {"class": "topictitle"}).get('href') |
|
|
link = itopic.find('a', {"class": "topictitle"}).get('href') |
|
|
link = cleanLink(link) |
|
|
link = cleanLink(link) |
|
@ -224,18 +227,18 @@ def bestcardingworld_listing_parser(soup): |
|
|
|
|
|
|
|
|
# Finding the author of the topic |
|
|
# Finding the author of the topic |
|
|
ps = itopic.find('div', {"class":"responsive-hide"}).find('a', {"class": "username-coloured"}).text |
|
|
ps = itopic.find('div', {"class":"responsive-hide"}).find('a', {"class": "username-coloured"}).text |
|
|
author = ps.strip() |
|
|
|
|
|
user.append(cleanString(author)) |
|
|
|
|
|
|
|
|
user = ps.strip() |
|
|
|
|
|
author.append(cleanString(user)) |
|
|
|
|
|
|
|
|
# Finding the number of replies |
|
|
# Finding the number of replies |
|
|
posts = replies[index].text.split()[0] |
|
|
|
|
|
posts = posts.strip() |
|
|
|
|
|
post.append(cleanString(posts)) |
|
|
|
|
|
|
|
|
post = replies[index].text.split()[0] |
|
|
|
|
|
post = post.strip() |
|
|
|
|
|
posts.append(cleanString(post)) |
|
|
|
|
|
|
|
|
# Finding the number of Views |
|
|
# Finding the number of Views |
|
|
tview = views[index].text.split()[0] |
|
|
|
|
|
|
|
|
tview = view[index].text.split()[0] |
|
|
tview = tview.strip() |
|
|
tview = tview.strip() |
|
|
view.append(cleanString(tview)) |
|
|
|
|
|
|
|
|
views.append(cleanString(tview)) |
|
|
|
|
|
|
|
|
# If no information about when the topic was added, just assign "-1" to the variable |
|
|
# If no information about when the topic was added, just assign "-1" to the variable |
|
|
#CryptBB doesn't show when topic was first posted on listing page |
|
|
#CryptBB doesn't show when topic was first posted on listing page |
|
@ -245,10 +248,9 @@ def bestcardingworld_listing_parser(soup): |
|
|
addDate.append(date_time_obj) |
|
|
addDate.append(date_time_obj) |
|
|
#addDate.append("-1") |
|
|
#addDate.append("-1") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
index += 1 |
|
|
index += 1 |
|
|
return organizeTopics("BestCardingWorld", nm, topic, board, view, post, user, addDate, href) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return organizeTopics(forum, nm, board, author, topic, views, posts, href, addDate) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#called by the crawler to get description links on a listing page |
|
|
#called by the crawler to get description links on a listing page |
|
|