|
|
@@ -1,8 +1,7 @@ |
|
|
|
__author__ = 'Helium' |
|
|
|
|
|
|
|
''' |
|
|
|
Altenens Forum Crawler (Selenium); |
|
|
|
Untested due to CAPTCHAs and network blocking |
|
|
|
Altenens Forum Crawler (Selenium) |
|
|
|
''' |
|
|
|
|
|
|
|
from selenium import webdriver |
|
|
@@ -31,18 +30,18 @@ baseURL = 'https://altenens.is/' |
|
|
|
|
|
|
|
# Opens Tor Browser, crawls the website |
|
|
|
def startCrawling(): |
|
|
|
opentor() |
|
|
|
# opentor() |
|
|
|
forumName = getForumName() |
|
|
|
driver = getAccess() |
|
|
|
|
|
|
|
if driver != 'down': |
|
|
|
try: |
|
|
|
login(driver) |
|
|
|
crawlForum(driver) |
|
|
|
except Exception as e: |
|
|
|
print(driver.current_url, e) |
|
|
|
closetor(driver) |
|
|
|
|
|
|
|
# driver = getAccess() |
|
|
|
# |
|
|
|
# if driver != 'down': |
|
|
|
# try: |
|
|
|
# login(driver) |
|
|
|
# crawlForum(driver) |
|
|
|
# except Exception as e: |
|
|
|
# print(driver.current_url, e) |
|
|
|
# closetor(driver) |
|
|
|
# |
|
|
|
new_parse(forumName, baseURL, False) |
|
|
|
|
|
|
|
|
|
|
@@ -73,12 +72,12 @@ def login(driver): |
|
|
|
#Password here |
|
|
|
passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox |
|
|
|
|
|
|
|
input("Press ENTER when you complete the CAPTCHA and press login\n") |
|
|
|
input("Press ENTER when CAPTCHA is completed\n") |
|
|
|
|
|
|
|
# wait for listing page show up (This Xpath may need to change based on different seed url) |
|
|
|
# wait for 50 sec until id = tab_content is found, then cont |
|
|
|
# WebDriverWait(driver, 50).until(EC.visibility_of_element_located( |
|
|
|
# (By.XPATH, '/html/body/div[1]/div[4]/div/div/div[3]/div/div/div[4]/div/div/div[1]/div/div[1]'))) |
|
|
|
WebDriverWait(driver, 50).until(EC.visibility_of_element_located( |
|
|
|
(By.XPATH, '/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]'))) |
|
|
|
|
|
|
|
|
|
|
|
# Returns the name of the website |
|
|
@@ -205,76 +204,64 @@ def crawlForum(driver): |
|
|
|
print("Crawling the Altenens forum") |
|
|
|
|
|
|
|
linksToCrawl = getInterestedLinks() |
|
|
|
visited = set(linksToCrawl) |
|
|
|
initialTime = time.time() |
|
|
|
|
|
|
|
i = 0 |
|
|
|
count = 0 |
|
|
|
while i < len(linksToCrawl): |
|
|
|
link = linksToCrawl[i] |
|
|
|
print('Crawling :', link) |
|
|
|
try: |
|
|
|
try: |
|
|
|
driver.get(link)# open |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
html = driver.page_source |
|
|
|
savePage(html, link) |
|
|
|
|
|
|
|
has_next_page = True |
|
|
|
count = 0 |
|
|
|
|
|
|
|
#loop through the topics |
|
|
|
while has_next_page: |
|
|
|
list = topicPages(html)# for multiple pages |
|
|
|
for item in list: |
|
|
|
#variable to check if there is a next page for the topic |
|
|
|
try: |
|
|
|
driver.get(link) |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
html = driver.page_source |
|
|
|
savePage(html, link) |
|
|
|
|
|
|
|
topics = topicPages(html) |
|
|
|
for topic in topics: |
|
|
|
has_next_topic_page = True |
|
|
|
counter = 1 |
|
|
|
page = topic |
|
|
|
|
|
|
|
# check if there is a next page for the topics |
|
|
|
while has_next_topic_page: |
|
|
|
# try to access next page of th topic |
|
|
|
itemURL = urlparse.urljoin(baseURL, str(item)) |
|
|
|
itemURL = urlparse.urljoin(baseURL, str(page)) |
|
|
|
try: |
|
|
|
driver.get(itemURL) |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
savePage(driver.page_source, item) |
|
|
|
savePage(driver.page_source, topic + f"page{counter}") |
|
|
|
|
|
|
|
# if there is a next page then go and save.... |
|
|
|
# specific |
|
|
|
try: |
|
|
|
item = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') |
|
|
|
# comment out |
|
|
|
if counter == 2: |
|
|
|
break |
|
|
|
|
|
|
|
if item == "": |
|
|
|
try: |
|
|
|
page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href') |
|
|
|
if page == "": |
|
|
|
raise NoSuchElementException |
|
|
|
has_next_topic_page = False |
|
|
|
else: |
|
|
|
counter += 1 |
|
|
|
counter += 1 |
|
|
|
|
|
|
|
except NoSuchElementException: |
|
|
|
has_next_topic_page = False |
|
|
|
|
|
|
|
#end of loop |
|
|
|
for i in range(counter): |
|
|
|
driver.back() |
|
|
|
|
|
|
|
# comment out |
|
|
|
break |
|
|
|
|
|
|
|
# comment out |
|
|
|
if count == 1: |
|
|
|
count = 0 |
|
|
|
break |
|
|
|
|
|
|
|
try:# change depending on web page, #next page |
|
|
|
try: |
|
|
|
link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href') |
|
|
|
if link == "": |
|
|
|
raise NoSuchElementException |
|
|
|
try: |
|
|
|
driver.get(link) |
|
|
|
except: |
|
|
|
driver.refresh() |
|
|
|
html = driver.page_source |
|
|
|
savePage(html, link) |
|
|
|
count += 1 |
|
|
|
|
|
|
|
except NoSuchElementException: |
|
|
@@ -284,9 +271,6 @@ def crawlForum(driver): |
|
|
|
print(link, e) |
|
|
|
i += 1 |
|
|
|
|
|
|
|
# finalTime = time.time() |
|
|
|
# print finalTime - initialTime |
|
|
|
|
|
|
|
input("Crawling Altenens forum done successfully. Press ENTER to continue\n") |
|
|
|
|
|
|
|
|
|
|
|