
finished Altenens parser

changed naming scheme of multiple pages of topics
main · westernmeadow · 1 year ago
commit 3a665039c6
5 changed files with 70 additions and 77 deletions
  1. +38 -54  Forums/Altenens/crawler_selenium.py
  2. +14 -7   Forums/Altenens/parser.py
  3. +2  -2   Forums/Initialization/forums_mining.py
  4. +14 -12  Forums/Initialization/prepare_parser.py
  5. +2  -2   MarketPlaces/Tor2door/crawler_selenium.py

+38 -54  Forums/Altenens/crawler_selenium.py

@@ -1,8 +1,7 @@
 __author__ = 'Helium'
 '''
-Altenens Forum Crawler (Selenium);
-Untested due to CAPTCHAS and blocking the network
+Altenens Forum Crawler (Selenium)
 '''
 from selenium import webdriver
@@ -31,18 +30,18 @@ baseURL = 'https://altenens.is/'

 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
+    # opentor()
     forumName = getForumName()
-    driver = getAccess()
-
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
+    #
     new_parse(forumName, baseURL, False)
@@ -73,12 +72,12 @@ def login(driver):
     #Password here
     passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox

-    input("Press ENTER when you complete the CAPTCHA and press login\n")
+    input("Press ENTER when CAPTCHA is completed\n")

     # wait for listing page show up (This Xpath may need to change based on different seed url)
     # wait for 50 sec until id = tab_content is found, then cont
-    # WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
-    #     (By.XPATH, '/html/body/div[1]/div[4]/div/div/div[3]/div/div/div[4]/div/div/div[1]/div/div[1]')))
+    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]')))


 # Returns the name of the website
@@ -205,76 +204,64 @@ def crawlForum(driver):
     print("Crawling the Altenens forum")

     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()

     i = 0
-    count = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)# open
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0

-            #loop through the topics
             while has_next_page:
-                list = topicPages(html)# for multiple pages
-                for item in list:
-                    #variable to check if there is a next page for the topic
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
+                topics = topicPages(html)
+                for topic in topics:
                     has_next_topic_page = True
                     counter = 1
+                    page = topic

-                    # check if there is a next page for the topics
                     while has_next_topic_page:
-                        # try to access next page of th topic
-                        itemURL = urlparse.urljoin(baseURL, str(item))
+                        itemURL = urlparse.urljoin(baseURL, str(page))
                         try:
                             driver.get(itemURL)
                         except:
                             driver.refresh()
-                        savePage(driver.page_source, item)
+                        savePage(driver.page_source, topic + f"page{counter}")

-                        # if there is a next page then go and save....
-                        # specific
-                        try:
-                            item = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
+                        # comment out
+                        if counter == 2:
+                            break

-                            if item == "":
+                        try:
+                            page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
+                            if page == "":
                                 raise NoSuchElementException
-                                has_next_topic_page = False
-                            else:
-                                counter += 1
+                            counter += 1

                         except NoSuchElementException:
                             has_next_topic_page = False

-                    #end of loop
                     for i in range(counter):
                         driver.back()

                     # comment out
                     break

                 # comment out
                 if count == 1:
-                    count = 0
                     break

-                try:# change depending on web page, #next page
+                try:
                     link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')

                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1

                 except NoSuchElementException:
@@ -284,9 +271,6 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling Altenens forum done successfully. Press ENTER to continue\n")


+14 -7  Forums/Altenens/parser.py

@@ -27,7 +27,8 @@ def altenens_description_parser(soup):
     topic = soup.find("h1", {"class": "p-title-value"}).text
     topic = cleanString(topic.strip())

-    iposts = soup.find('div', {"class": "block-body js-replyNewMessageContainer"}).find_all('article')
+    body = soup.find('div', {"class": "block-container lbContainer"})
+    iposts = body.find_all('article', {"class": "message message--post js-post js-inlineModContainer"})

     for ipost in iposts:
@@ -54,12 +55,16 @@
         sign.append(cleanString(signature))

         inner = ipost.find('div', {"class": "bbWrapper"}).find(text=True, recursive=False)
-        post.append(cleanString(inner.strip()))
+        if inner is not None:
+            inner = inner.strip()
+        else:
+            inner = "-1"
+        post.append(cleanString(inner))

         feedback.append("-1")

-        dt = ipost.find('time', {"class": "u-dt"})
-        date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
+        dt = ipost.find('time', {"class": "u-dt"}).get('datetime')
+        date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
         addDate.append(date_time_obj)

     # Populate the final variable (this should be a list with all fields scraped)
@@ -101,11 +106,11 @@ def altenens_listing_parser(soup):
         link = itopic.find('a').get('href')
         href.append(link)

-        user = itopic.find('div', {"class": "structItem-parts"}).find('a').text
+        user = itopic.find('ul', {"class": "structItem-parts"}).find('a').text
         author.append(cleanString(user.strip()))

-        dt = itopic.find('li', {"class": "structItem-startDate"}).get('datetime')
-        date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
+        dt = itopic.find('time', {"class": "u-dt"}).get('datetime')
+        date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
         addDate.append(date_time_obj)

     itopics = soup.find_all('div', {"class": "structItem-cell structItem-cell--meta"})
@@ -113,10 +118,12 @@
     for itopic in itopics:
         nposts = itopic.find('dl', {"class": "pairs pairs--justified"}).text
+        nposts = nposts.replace('Replies', '')
         nposts = nposts.replace('K', '000')
         posts.append(cleanString(nposts))

         nviews = itopic.find('dl', {"class": "pairs pairs--justified structItem-minor"}).text
+        nviews = nviews.replace('Views', '')
         nviews = nviews.replace('K', '000')
         views.append(cleanString(nviews))
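
Both date-handling changes in this file rely on the same detail of XenForo markup: the time element with class "u-dt" carries an ISO-8601 timestamp in its datetime attribute, so the parser now reads that attribute and parses it with '%Y-%m-%dT%H:%M:%S%z' instead of applying a US-style format to the tag object itself (which is not a string and would fail in strptime). A small self-contained check of the new parsing; the timestamp below is made up:

# Hedged example (not from the repo): parsing the ISO-8601 value that XenForo
# stores in the datetime attribute of <time class="u-dt">.
from datetime import datetime
from bs4 import BeautifulSoup

html = '<time class="u-dt" datetime="2022-05-08T19:25:38-0400">May 8, 2022</time>'
soup = BeautifulSoup(html, 'html.parser')

dt = soup.find('time', {"class": "u-dt"}).get('datetime')
date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
print(date_time_obj)  # 2022-05-08 19:25:38-04:00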


+2 -2  Forums/Initialization/forums_mining.py

@@ -113,12 +113,12 @@ if __name__ == '__main__':
         crawlerAbyssForum()
     elif forum == "HiddenAnswers":
         crawlerHiddenAnswers()
-    elif forum == "Altenens":
-        crawlerAltenens()
     elif forum == 'Procrax':
         crawlerProcraxForum()
     elif forum == 'Cardingleaks':
         crawlerCardingleaks()
+    elif forum == 'Altenens':
+        crawlerAltenens()


+14 -12  Forums/Initialization/prepare_parser.py

@@ -8,6 +8,7 @@ from Forums.DB_Connection.db_connection import *
 from Forums.BestCardingWorld.parser import *
 from Forums.CryptBB.parser import *
 from Forums.OnniForums.parser import *
+from Forums.Altenens.parser import *
 from Forums.Classifier.classify_product import predict
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@@ -151,27 +152,27 @@ def new_parse(forum, url, createLog):
                rmm = cryptBB_description_parser(soup)
            elif forum == "OnniForums":
                rmm = onniForums_description_parser(soup)
+            elif forum == "Altenens":
+                rmm = altenens_description_parser(soup)

            # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
            key = u"Url:" + os.path.basename(line2).replace(".html", "")

-            # check if page or page exists at the end of a string followed by a series of numbers
-            #if yes add to other if no add to first page dictionary
-            # save descritions into record in memory
-            check = re.compile(r'(?<=Page|page)[0-9]*')
+            # check if "page1" exists at the end of a string
+            # if yes add to first page directory if no add to other
+            check = re.compile(r'page1$')
            if check.search(key):
-                # print(key, 'is an other page\n')
-                other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
-            else:
                # print(key, 'is a first page\n')
                detPage[key] = {'rmm': rmm, 'files': [os.path.basename(line2)]}
+            else:
+                # print(key, 'is an other page\n')
+                other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}

        except:
            nError += 1
            print("There was a problem to parse the file " + line2 + " in the Description section!")
-            traceback.print_exc()
            if createLog:
                logFile.write(str(nError) + ". There was a problem to parse the file " + line2 + " in the Description section.\n")
@@ -195,7 +196,6 @@ def new_parse(forum, url, createLog):
            other.pop(k)

-
    # Parsing the Listing Pages and put the tag's content into a list
    for index, line1 in enumerate(lines):
@@ -231,6 +231,8 @@ def new_parse(forum, url, createLog):
                rw = cryptBB_listing_parser(soup)
            elif forum == "OnniForums":
                rw = onniForums_listing_parser(soup)
+            elif forum == "Altenens":
+                rw = altenens_listing_parser(soup)

        except:
@@ -255,8 +257,8 @@ def new_parse(forum, url, createLog):
            # print(rec)

            # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
-            key = u"Url:" + cleanLink(rec[6])
-            print(key)
+            key = u"Url:" + cleanLink(rec[6]) + "page1"
+            # print(key)

            if key in detPage:
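
The "page1" suffix appended to the listing key mirrors the crawler's new file naming, and the page1$ regex in the description pass sorts saved pages into the two dictionaries: first pages go to detPage, continuation pages to other. A minimal sketch of that bucketing, assuming description files follow the crawler's "<topic>pageN.html" naming; the filenames below are illustrative:

# Hedged sketch of the first-page bucketing done above; filenames are made up.
import os
import re

check = re.compile(r'page1$')   # same pattern as above: the key must end in "page1"

for line2 in ['carding-tutorial.12345page1.html', 'carding-tutorial.12345page2.html']:
    key = u"Url:" + os.path.basename(line2).replace(".html", "")
    if check.search(key):
        print(key, '-> detPage (first page of a topic)')
    else:
        print(key, '-> other (continuation page)')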


+2 -2  MarketPlaces/Tor2door/crawler_selenium.py

@@ -30,7 +30,7 @@ baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion

 # Opens Tor Browser, crawls the website
 def startCrawling():
     opentor()
-    # marketName = getMarketName()
+    # marketName = getMKTName()
     driver = getAccess()

     if driver != 'down':
@@ -105,7 +105,7 @@ def login(driver):

 # Returns the name of the website
-def getMarketName():
+def getMKTName():
     name = 'Tor2door'
     return name

