Browse Source

last change before REU ended

main
Khoi 1 year ago
parent
commit
9a64698a9c
3 changed files with 57 additions and 31 deletions
  1. +13
    -13
      Forums/HiddenAnswers/crawler_selenium.py
  2. +39
    -18
      Forums/HiddenAnswers/parser.py
  3. +5
    -0
      Forums/Initialization/prepare_parser.py

+ 13
- 13
Forums/HiddenAnswers/crawler_selenium.py View File

@ -30,19 +30,19 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion
# Opens Tor Browser, crawls the website
def startCrawling():
# NOTE(review): this listing is a diff hunk rendered without +/- markers or
# indentation. The first run of statements (opentor() .. closetor(driver))
# looks like the pre-change body and the second run (crawl commented out,
# getForumName()/new_parse() live) like the post-change body -- confirm
# against the repository before reading this as a single function body.
opentor()
# forumName = getForumName()
driver: webdriver.Firefox = getAccess()
# getAccess() is compared against the string 'down' before any crawling is
# attempted; presumably a sentinel for an unreachable site -- verify in
# getAccess().
if driver != 'down':
try:
login(driver)
crawlForum(driver)
except Exception as e:
# Failures are printed together with the URL the driver was on; they are not
# re-raised here.
print(driver.current_url, e)
closetor(driver)
# new_parse(forumName, baseURL, False)
# opentor()
forumName = getForumName()
# driver: webdriver.Firefox = getAccess()
# if driver != 'down':
# try:
# login(driver)
# crawlForum(driver)
# except Exception as e:
# print(driver.current_url, e)
# closetor(driver)
# Runs the parser for this forum using the module-level baseURL; the third
# positional argument is new_parse's createLog parameter, passed as False.
new_parse(forumName, baseURL, False)
# Opens Tor Browser


+ 39
- 18
Forums/HiddenAnswers/parser.py View File

@ -38,14 +38,20 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S")
addDate.append(datetime_obj)
question_user_status = question.find("span", {"class": "qa-q-view-who-title"}).text
status.append(cleanString(question_user_status.strip()))
try:
question_user_status = question.find("span", {"class": "qa-q-view-who-title"}).text
status.append(cleanString(question_user_status.strip()))
except AttributeError:
status.append("-1")
question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}).text
# Convert karma to pure numerical string
if question_user_karma.find("k") > -1:
question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
reputation.append(cleanString(question_user_karma.strip()))
try:
question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}).text
# Convert karma to pure numerical string
if question_user_karma.find("k") > -1:
question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
reputation.append(cleanString(question_user_karma.strip()))
except AttributeError:
reputation.append("-1")
question_content = question.find("div", {"class": "qa-q-view-content qa-post-content"}).text
post.append(cleanString(question_content.strip()))
@ -71,14 +77,20 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
post_data = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).text
post.append(cleanString(post_data.strip()))
user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}).text
status.append(cleanString(user_reputations.strip()))
try:
user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}).text
status.append(cleanString(user_reputations.strip()))
except AttributeError:
status.append("-1")
karma = replies.find("span", {"class": "qa-a-item-who-points-data"}).text
# Convert karma to pure numerical string
if karma.find("k") > -1:
karma = str(float(karma.replace("k", "")) * 1000)
reputation.append(cleanString(karma.strip()))
try:
karma = replies.find("span", {"class": "qa-a-item-who-points-data"}).text
# Convert karma to pure numerical string
if karma.find("k") > -1:
karma = str(float(karma.replace("k", "")) * 1000)
reputation.append(cleanString(karma.strip()))
except AttributeError:
reputation.append("-1")
feedback.append("-1")
sign.append("-1")
@ -114,7 +126,7 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text
topic.append(cleanString(topic_of_query.strip()))
author = queries.find("span", {"class": "qa-q-item-who-data"}).find("a").text
author = queries.find("span", {"class": "qa-q-item-who-data"}).text
user.append(cleanString(author.strip()))
num_answers = queries.find("span", {"class": "qa-a-count-data"}).text
@ -124,10 +136,19 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
date_posted = queries.find("span", {"class": "qa-q-item-when-data"}).text
if date_posted.find("day") > 0:
datetime_obj = datetime.now() - timedelta(days=1)
if date_posted.find("minute") > 0:
minutes_ago = date_posted.split(' ')[0]
datetime_obj = datetime.now() - timedelta(minutes=int(minutes_ago))
elif date_posted.find("day") > 0:
days_ago = date_posted.split(' ')[0]
datetime_obj = datetime.now() - timedelta(days=int(days_ago))
elif bool(re.search(r"\d{4}", date_posted)):
datetime_obj = datetime.strptime(date_posted, "%b %d, %Y")
else:
datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
datetime_obj = datetime.strptime(f"{date_posted}, {date.today().year}", "%b %d, %Y")
addDate.append(datetime_obj)
#this link will be cleaned


+ 5
- 0
Forums/Initialization/prepare_parser.py View File

@ -12,6 +12,7 @@ from Forums.OnniForums.parser import *
from Forums.Altenens.parser import *
from Forums.Procrax.parser import *
from Forums.Libre.parser import *
from Forums.HiddenAnswers.parser import *
from Forums.Classifier.classify_product import predict
# from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@ -163,6 +164,8 @@ def new_parse(forum, url, createLog):
rmm = procrax_description_parser(soup)
elif forum == "Libre":
rmm = libre_description_parser(soup)
elif forum == "HiddenAnswers":
rmm = HiddenAnswers_description_parser(soup)
# key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
key = u"Url:" + os.path.basename(line2).replace(".html", "")
@ -248,6 +251,8 @@ def new_parse(forum, url, createLog):
rw = procrax_listing_parser(soup)
elif forum == "Libre":
rw = libre_listing_parser(soup)
elif forum == "HiddenAnswers":
rw = HiddenAnswers_listing_parser(soup)
except:


Loading…
Cancel
Save