last change before REU ended

1 year ago · 9a64698a9c
--- a/Forums/HiddenAnswers/crawler_selenium.py
+++ b/Forums/HiddenAnswers/crawler_selenium.py
@ -30,19 +30,19 @@ baseURL = 'http://7eoz4h2nvw4zlr7gvlbutinqqpm546f5egswax54az6lt2u7e3t6d7yd.onion

 # Opens Tor Browser, crawls the website
 def startCrawling():
    """Crawl the forum through Tor, then hand the result to the parser.

    As written, the live statements call opentor(), take a driver from
    getAccess(), and -- unless that value equals the string 'down' --
    call login() and crawlForum() on it before closetor(). Afterwards the
    forum name from getForumName() is passed, together with baseURL and
    False, to new_parse().

    NOTE(review): this text appears to interleave the removed and added
    lines of a diff hunk (the +/- markers are missing), so the live
    statements and the commented-out duplicates below may be two
    alternative versions of the same function -- confirm against the
    repository before relying on either.
    """
    opentor()
    # forumName = getForumName()
    driver: webdriver.Firefox = getAccess()

    # getAccess() is compared against the sentinel string 'down'; the
    # crawl (and closetor) is skipped entirely in that case.
    if driver != 'down':
        try:
            login(driver)
            crawlForum(driver)
        except Exception as e:
            # Failures are printed with the page being visited rather than
            # re-raised, so closetor() below still runs.
            print(driver.current_url, e)
        closetor(driver)

    # new_parse(forumName, baseURL, False)
    # opentor()
    forumName = getForumName()
    # driver: webdriver.Firefox = getAccess()

    # if driver != 'down':
    #     try:
    #         login(driver)
    #         crawlForum(driver)
    #     except Exception as e:
    #         print(driver.current_url, e)
    #     closetor(driver)

    # Runs whether or not the crawl above happened; the third argument is
    # passed positionally as False.
    new_parse(forumName, baseURL, False)


 # Opens Tor Browser
--- a/Forums/HiddenAnswers/parser.py
+++ b/Forums/HiddenAnswers/parser.py
@ -38,14 +38,20 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
    datetime_obj = datetime.strptime(datetime_string, "%Y-%m-%dT%H:%M:%S")
    addDate.append(datetime_obj)
    
    question_user_status = question.find("span", {"class": "qa-q-view-who-title"}).text
    status.append(cleanString(question_user_status.strip()))
    try:
        question_user_status = question.find("span", {"class": "qa-q-view-who-title"}).text
        status.append(cleanString(question_user_status.strip()))
    except AttributeError:
        status.append("-1")
    
    question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}).text
    # Convert karma to pure numerical string
    if question_user_karma.find("k") > -1:
        question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
    reputation.append(cleanString(question_user_karma.strip()))
    try:
        question_user_karma = question.find("span", {"class": "qa-q-view-who-points-data"}).text
        # Convert karma to pure numerical string
        if question_user_karma.find("k") > -1:
            question_user_karma = str(float(question_user_karma.replace("k", "")) * 1000)
        reputation.append(cleanString(question_user_karma.strip()))
    except AttributeError:
        reputation.append("-1")
    
    question_content = question.find("div", {"class": "qa-q-view-content qa-post-content"}).text
    post.append(cleanString(question_content.strip()))
@ -71,14 +77,20 @@ def HiddenAnswers_description_parser(soup: BeautifulSoup):
        post_data = replies.find("div", {"class": "qa-a-item-content qa-post-content"}).find("div",{"itemprop":"text"}).text
        post.append(cleanString(post_data.strip()))
        
        user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}).text
        status.append(cleanString(user_reputations.strip()))
        try:
            user_reputations = replies.find("span", {"class", "qa-a-item-who-title"}).text
            status.append(cleanString(user_reputations.strip()))
        except AttributeError:
            status.append("-1")
        
        karma = replies.find("span", {"class": "qa-a-item-who-points-data"}).text
        # Convert karma to pure numerical string
        if karma.find("k") > -1:
            karma = str(float(karma.replace("k", "")) * 1000)
        reputation.append(cleanString(karma.strip()))
        try:
            karma = replies.find("span", {"class": "qa-a-item-who-points-data"}).text
            # Convert karma to pure numerical string
            if karma.find("k") > -1:
                karma = str(float(karma.replace("k", "")) * 1000)
            reputation.append(cleanString(karma.strip()))
        except AttributeError:
            reputation.append("-1")
        
        feedback.append("-1")
        sign.append("-1")
@ -114,7 +126,7 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
        topic_of_query = queries.find("div", {"class": "qa-q-item-title"}).find("a").text
        topic.append(cleanString(topic_of_query.strip()))
        
        author = queries.find("span", {"class": "qa-q-item-who-data"}).find("a").text
        author = queries.find("span", {"class": "qa-q-item-who-data"}).text
        user.append(cleanString(author.strip()))
        
        num_answers = queries.find("span", {"class": "qa-a-count-data"}).text
@ -124,10 +136,19 @@ def HiddenAnswers_listing_parser(soup: BeautifulSoup):
        
        date_posted = queries.find("span", {"class": "qa-q-item-when-data"}).text
        
        if date_posted.find("day") > 0:
            datetime_obj = datetime.now() - timedelta(days=1)
        if date_posted.find("minute") > 0:
            minutes_ago = date_posted.split(' ')[0]
            datetime_obj = datetime.now() - timedelta(minutes=int(minutes_ago))
        
        elif date_posted.find("day") > 0:
            days_ago = date_posted.split(' ')[0]
            datetime_obj = datetime.now() - timedelta(days=int(days_ago))
            
        elif bool(re.search(r"\d{4}", date_posted)):
             datetime_obj = datetime.strptime(date_posted, "%b %d, %Y")
        
        else:
            datetime_obj = datetime.strptime(f"{date_posted} {date.today().year}", "%b %d %Y")
            datetime_obj = datetime.strptime(f"{date_posted}, {date.today().year}", "%b %d, %Y")
        addDate.append(datetime_obj)
        #this link will be cleaned
        
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@ -12,6 +12,7 @@ from Forums.OnniForums.parser import *
 from Forums.Altenens.parser import *
 from Forums.Procrax.parser import *
 from Forums.Libre.parser import *
 from Forums.HiddenAnswers.parser import *

 from Forums.Classifier.classify_product import predict
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@ -163,6 +164,8 @@ def new_parse(forum, url, createLog):
                rmm = procrax_description_parser(soup)
            elif forum == "Libre":
                rmm = libre_description_parser(soup)
            elif forum == "HiddenAnswers":
                rmm = HiddenAnswers_description_parser(soup)

            # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
            key = u"Url:" + os.path.basename(line2).replace(".html", "")
@ -248,6 +251,8 @@ def new_parse(forum, url, createLog):
                    rw = procrax_listing_parser(soup)
                elif forum == "Libre":
                    rw = libre_listing_parser(soup)
                elif forum == "HiddenAnswers":
                    rw = HiddenAnswers_listing_parser(soup)

            except: