diff --git a/Forums/Altenens/crawler_selenium.py b/Forums/Altenens/crawler_selenium.py
index b1c110a..736022c 100644
--- a/Forums/Altenens/crawler_selenium.py
+++ b/Forums/Altenens/crawler_selenium.py
@@ -1,8 +1,7 @@
 __author__ = 'Helium'
 
 '''
-Altenens Forum Crawler (Selenium);
-Untested due to CAPTCHAS and blocking the network
+Altenens Forum Crawler (Selenium)
 '''
 
 from selenium import webdriver
@@ -31,18 +30,18 @@ baseURL = 'https://altenens.is/'
 
 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
+    # opentor()
     forumName = getForumName()
-    driver = getAccess()
-
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
+    #
     new_parse(forumName, baseURL, False)
 
 
@@ -73,12 +72,12 @@ def login(driver):
     #Password here
     passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox
 
-    input("Press ENTER when you complete the CAPTCHA and press login\n")
+    input("Press ENTER when CAPTCHA is completed\n")
 
     # wait for listing page show up (This Xpath may need to change based on different seed url)
     # wait for 50 sec until id = tab_content is found, then cont
-    # WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
-    #     (By.XPATH, '/html/body/div[1]/div[4]/div/div/div[3]/div/div/div[4]/div/div/div[1]/div/div[1]')))
+    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]')))
 
 
 # Returns the name of the website
@@ -205,76 +204,64 @@ def crawlForum(driver):
     print("Crawling the Altenens forum")
 
     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()
 
     i = 0
-    count = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)# open
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0
 
-            #loop through the topics
             while has_next_page:
-                list = topicPages(html)# for multiple pages
-                for item in list:
-                    #variable to check if there is a next page for the topic
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
+                topics = topicPages(html)
+                for topic in topics:
                     has_next_topic_page = True
                     counter = 1
+                    page = topic
 
-                    # check if there is a next page for the topics
                     while has_next_topic_page:
-                        # try to access next page of th topic
-                        itemURL = urlparse.urljoin(baseURL, str(item))
+                        itemURL = urlparse.urljoin(baseURL, str(page))
                         try:
                             driver.get(itemURL)
                         except:
                             driver.refresh()
-                        savePage(driver.page_source, item)
+                        savePage(driver.page_source, topic + f"page{counter}")
 
-                        # if there is a next page then go and save....
-                        # specific
-                        try:
-                            item = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
+                        # comment out
+                        if counter == 2:
+                            break
 
-                            if item == "":
+                        try:
+                            page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
+                            if page == "":
                                 raise NoSuchElementException
-                            has_next_topic_page = False
-                        else:
-                            counter += 1
+                            counter += 1
+
                         except NoSuchElementException:
                             has_next_topic_page = False
 
-                    #end of loop
                     for i in range(counter):
                         driver.back()
 
+                    # comment out
                     break
 
                 # comment out
                 if count == 1:
-                    count = 0
                     break
 
-                try:# change depending on web page, #next page
+                try:
                     link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')
 
                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1
 
                 except NoSuchElementException:
@@ -284,9 +271,6 @@ def crawlForum(driver):
             print(link, e)
         i += 1
 
-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling Altenens forum done successfully. Press ENTER to continue\n")
 
 
diff --git a/Forums/Altenens/parser.py b/Forums/Altenens/parser.py
index 5c5effd..71fb34d 100644
--- a/Forums/Altenens/parser.py
+++ b/Forums/Altenens/parser.py
@@ -27,7 +27,8 @@
     topic = soup.find("h1", {"class": "p-title-value"}).text
     topic = cleanString(topic.strip())
 
-    iposts = soup.find('div', {"class": "block-body js-replyNewMessageContainer"}).find_all('article')
+    body = soup.find('div', {"class": "block-container lbContainer"})
+    iposts = body.find_all('article', {"class": "message message--post js-post js-inlineModContainer"})
 
     for ipost in iposts:
 
@@ -54,12 +55,16 @@
         sign.append(cleanString(signature))
 
         inner = ipost.find('div', {"class": "bbWrapper"}).find(text=True, recursive=False)
-        post.append(cleanString(inner.strip()))
+        if inner is not None:
+            inner = inner.strip()
+        else:
+            inner = "-1"
+        post.append(cleanString(inner))
 
         feedback.append("-1")
 
-        dt = ipost.find('time', {"class": "u-dt"})
-        date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
+        dt = ipost.find('time', {"class": "u-dt"}).get('datetime')
+        date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
         addDate.append(date_time_obj)
 
     # Populate the final variable (this should be a list with all fields scraped)
@@ -101,11 +106,11 @@
         link = itopic.find('a').get('href')
         href.append(link)
 
-        user = itopic.find('div', {"class": "structItem-parts"}).find('a').text
+        user = itopic.find('ul', {"class": "structItem-parts"}).find('a').text
         author.append(cleanString(user.strip()))
 
-        dt = itopic.find('li', {"class": "structItem-startDate"}).get('datetime')
-        date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
+        dt = itopic.find('time', {"class": "u-dt"}).get('datetime')
+        date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
         addDate.append(date_time_obj)
 
     itopics = soup.find_all('div', {"class": "structItem-cell structItem-cell--meta"})
@@ -113,10 +118,12 @@
     for itopic in itopics:
 
         nposts = itopic.find('dl', {"class": "pairs pairs--justified"}).text
+        nposts = nposts.replace('Replies', '')
         nposts = nposts.replace('K', '000')
         posts.append(cleanString(nposts))
 
         nviews = itopic.find('dl', {"class": "pairs pairs--justified structItem-minor"}).text
+        nviews = nviews.replace('Views', '')
         nviews = nviews.replace('K', '000')
         views.append(cleanString(nviews))
 
diff --git a/Forums/Initialization/forums_mining.py b/Forums/Initialization/forums_mining.py
index 1534169..53e27d4 100644
--- a/Forums/Initialization/forums_mining.py
+++ b/Forums/Initialization/forums_mining.py
@@ -113,12 +113,12 @@ if __name__ == '__main__':
         crawlerAbyssForum()
     elif forum == "HiddenAnswers":
         crawlerHiddenAnswers()
-    elif forum == "Altenens":
-        crawlerAltenens()
     elif forum == 'Procrax':
         crawlerProcraxForum()
     elif forum == 'Cardingleaks':
         crawlerCardingleaks()
+    elif forum == 'Altenens':
+        crawlerAltenens()
 
diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py
index b062129..fe8be28 100644
--- a/Forums/Initialization/prepare_parser.py
+++ b/Forums/Initialization/prepare_parser.py
@@ -8,6 +8,7 @@ from Forums.DB_Connection.db_connection import *
 from Forums.BestCardingWorld.parser import *
 from Forums.CryptBB.parser import *
 from Forums.OnniForums.parser import *
+from Forums.Altenens.parser import *
 from Forums.Classifier.classify_product import predict
 
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@@ -151,27 +152,27 @@ def new_parse(forum, url, createLog):
                        rmm = cryptBB_description_parser(soup)
                    elif forum == "OnniForums":
                        rmm = onniForums_description_parser(soup)
-
+                    elif forum == "Altenens":
+                        rmm = altenens_description_parser(soup)
 
                    # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
                    key = u"Url:" + os.path.basename(line2).replace(".html", "")
 
-                    # check if page or page exists at the end of a string followed by a series of numbers
-                    #if yes add to other if no add to first page dictionary
-                    # save descritions into record in memory
-                    check = re.compile(r'(?<=Page|page)[0-9]*')
+                    # check if "page1" exists at the end of a string
+                    # if yes add to first page directory if no add to other
+                    check = re.compile(r'page1$')
                    if check.search(key):
-                        # print(key, 'is an other page\n')
-                        other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
-                    else:
                        # print(key, 'is a first page\n')
                        detPage[key] = {'rmm': rmm, 'files': [os.path.basename(line2)]}
-
+                    else:
+                        # print(key, 'is an other page\n')
+                        other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
 
                except:
                    nError += 1
                    print("There was a problem to parse the file " + line2 + " in the Description section!")
+                    traceback.print_exc()
                    if createLog:
                        logFile.write(str(nError) + ". There was a problem to parse the file " + line2 + " in the Description section.\n")
@@ -195,7 +196,6 @@ def new_parse(forum, url, createLog):
                other.pop(k)
 
-
    # Parsing the Listing Pages and put the tag's content into a list
    for index, line1 in enumerate(lines):
 
@@ -231,6 +231,8 @@
                        rw = cryptBB_listing_parser(soup)
                    elif forum == "OnniForums":
                        rw = onniForums_listing_parser(soup)
+                    elif forum == "Altenens":
+                        rw = altenens_listing_parser(soup)
 
                except:
@@ -255,8 +257,8 @@
                    # print(rec)
 
                    # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
-                    key = u"Url:" + cleanLink(rec[6])
-                    print(key)
+                    key = u"Url:" + cleanLink(rec[6]) + "page1"
+                    # print(key)
 
                    if key in detPage:
diff --git a/MarketPlaces/Tor2door/crawler_selenium.py b/MarketPlaces/Tor2door/crawler_selenium.py
index ea509cf..a299c71 100644
--- a/MarketPlaces/Tor2door/crawler_selenium.py
+++ b/MarketPlaces/Tor2door/crawler_selenium.py
@@ -30,7 +30,7 @@ baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion
 # Opens Tor Browser, crawls the website
 def startCrawling():
     opentor()
-    # marketName = getMarketName()
+    # marketName = getMKTName()
     driver = getAccess()
 
     if driver != 'down':
@@ -105,7 +105,7 @@ def login(driver):
 
 
 # Returns the name of the website
-def getMarketName():
+def getMKTName():
     name = 'Tor2door'
     return name
 