
finished Altenens parser

changed naming scheme of multiple pages of topics
main · westernmeadow · 1 year ago
commit 3a665039c6
5 changed files with 70 additions and 77 deletions
  1. +38 -54  Forums/Altenens/crawler_selenium.py
  2. +14 -7   Forums/Altenens/parser.py
  3. +2  -2   Forums/Initialization/forums_mining.py
  4. +14 -12  Forums/Initialization/prepare_parser.py
  5. +2  -2   MarketPlaces/Tor2door/crawler_selenium.py

+38 -54  Forums/Altenens/crawler_selenium.py

@@ -1,8 +1,7 @@
 __author__ = 'Helium'
 '''
-Altenens Forum Crawler (Selenium);
-Untested due to CAPTCHAS and blocking the network
+Altenens Forum Crawler (Selenium)
 '''
 from selenium import webdriver
@@ -31,18 +30,18 @@ baseURL = 'https://altenens.is/'

 # Opens Tor Browser, crawls the website
 def startCrawling():
-    opentor()
+    # opentor()
     forumName = getForumName()
-    driver = getAccess()
-
-    if driver != 'down':
-        try:
-            login(driver)
-            crawlForum(driver)
-        except Exception as e:
-            print(driver.current_url, e)
-        closetor(driver)
-
+    # driver = getAccess()
+    #
+    # if driver != 'down':
+    #     try:
+    #         login(driver)
+    #         crawlForum(driver)
+    #     except Exception as e:
+    #         print(driver.current_url, e)
+    #     closetor(driver)
+    #
     new_parse(forumName, baseURL, False)
@@ -73,12 +72,12 @@ def login(driver):
     #Password here
     passwordBox.send_keys('johnnyTest@18')# sends string to passwordBox

-    input("Press ENTER when you complete the CAPTCHA and press login\n")
+    input("Press ENTER when CAPTCHA is completed\n")

     # wait for listing page show up (This Xpath may need to change based on different seed url)
     # wait for 50 sec until id = tab_content is found, then cont
-    # WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
-    #     (By.XPATH, '/html/body/div[1]/div[4]/div/div/div[3]/div/div/div[4]/div/div/div[1]/div/div[1]')))
+    WebDriverWait(driver, 50).until(EC.visibility_of_element_located(
+        (By.XPATH, '/html/body/div[1]/div[1]/div/div/div/div[1]/a[1]')))


 # Returns the name of the website
@@ -205,76 +204,64 @@ def crawlForum(driver):
     print("Crawling the Altenens forum")

     linksToCrawl = getInterestedLinks()
-    visited = set(linksToCrawl)
-    initialTime = time.time()

     i = 0
-    count = 0
     while i < len(linksToCrawl):
         link = linksToCrawl[i]
         print('Crawling :', link)
         try:
-            try:
-                driver.get(link)# open
-            except:
-                driver.refresh()
-            html = driver.page_source
-            savePage(html, link)
-
             has_next_page = True
+            count = 0

-            #loop through the topics
             while has_next_page:
-                list = topicPages(html)# for multiple pages
-                for item in list:
-                    #variable to check if there is a next page for the topic
+                try:
+                    driver.get(link)
+                except:
+                    driver.refresh()
+                html = driver.page_source
+                savePage(html, link)
+
+                topics = topicPages(html)
+                for topic in topics:
                     has_next_topic_page = True
                     counter = 1
+                    page = topic

-                    # check if there is a next page for the topics
                     while has_next_topic_page:
-                        # try to access next page of th topic
-                        itemURL = urlparse.urljoin(baseURL, str(item))
+                        itemURL = urlparse.urljoin(baseURL, str(page))
                         try:
                             driver.get(itemURL)
                         except:
                             driver.refresh()
-                        savePage(driver.page_source, item)
+                        savePage(driver.page_source, topic + f"page{counter}")

-                        # if there is a next page then go and save....
-                        # specific
-                        try:
-                            item = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
+                        # comment out
+                        if counter == 2:
+                            break

-                            if item == "":
+                        try:
+                            page = driver.find_element(By.LINK_TEXT, value='Next').get_attribute('href')
+                            if page == "":
                                 raise NoSuchElementException
-                                has_next_topic_page = False
-                            else:
-                                counter += 1
+                            counter += 1

                         except NoSuchElementException:
                             has_next_topic_page = False

-                    #end of loop
                     for i in range(counter):
                         driver.back()

                     # comment out
                     break

                 # comment out
                 if count == 1:
-                    count = 0
                     break

-                try:# change depending on web page, #next page
+                try:
                     link = driver.find_element(by=By.LINK_TEXT, value='Next').get_attribute('href')

                     if link == "":
                         raise NoSuchElementException
-                    try:
-                        driver.get(link)
-                    except:
-                        driver.refresh()
-                    html = driver.page_source
-                    savePage(html, link)
                     count += 1

                 except NoSuchElementException:
@@ -284,9 +271,6 @@ def crawlForum(driver):
             print(link, e)
         i += 1

-    # finalTime = time.time()
-    # print finalTime - initialTime
-
     input("Crawling Altenens forum done successfully. Press ENTER to continue\n")


+14 -7  Forums/Altenens/parser.py

@@ -27,7 +27,8 @@ def altenens_description_parser(soup):
     topic = soup.find("h1", {"class": "p-title-value"}).text
     topic = cleanString(topic.strip())

-    iposts = soup.find('div', {"class": "block-body js-replyNewMessageContainer"}).find_all('article')
+    body = soup.find('div', {"class": "block-container lbContainer"})
+    iposts = body.find_all('article', {"class": "message message--post js-post js-inlineModContainer"})

     for ipost in iposts:
@@ -54,12 +55,16 @@
         sign.append(cleanString(signature))

         inner = ipost.find('div', {"class": "bbWrapper"}).find(text=True, recursive=False)
-        post.append(cleanString(inner.strip()))
+        if inner is not None:
+            inner = inner.strip()
+        else:
+            inner = "-1"
+        post.append(cleanString(inner))

         feedback.append("-1")

-        dt = ipost.find('time', {"class": "u-dt"})
-        date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
+        dt = ipost.find('time', {"class": "u-dt"}).get('datetime')
+        date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
         addDate.append(date_time_obj)

     # Populate the final variable (this should be a list with all fields scraped)
@@ -101,11 +106,11 @@ def altenens_listing_parser(soup):
         link = itopic.find('a').get('href')
         href.append(link)

-        user = itopic.find('div', {"class": "structItem-parts"}).find('a').text
+        user = itopic.find('ul', {"class": "structItem-parts"}).find('a').text
         author.append(cleanString(user.strip()))

-        dt = itopic.find('li', {"class": "structItem-startDate"}).get('datetime')
-        date_time_obj = datetime.strptime(dt, '%m-%d-%Y, %I:%M %p')
+        dt = itopic.find('time', {"class": "u-dt"}).get('datetime')
+        date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
         addDate.append(date_time_obj)

     itopics = soup.find_all('div', {"class": "structItem-cell structItem-cell--meta"})
@@ -113,10 +118,12 @@
     for itopic in itopics:
         nposts = itopic.find('dl', {"class": "pairs pairs--justified"}).text
+        nposts = nposts.replace('Replies', '')
         nposts = nposts.replace('K', '000')
         posts.append(cleanString(nposts))

         nviews = itopic.find('dl', {"class": "pairs pairs--justified structItem-minor"}).text
+        nviews = nviews.replace('Views', '')
         nviews = nviews.replace('K', '000')
         views.append(cleanString(nviews))
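
Both date-handling changes in this file rely on the same detail of XenForo markup: the time element with class "u-dt" carries an ISO-8601 timestamp in its datetime attribute, so the parser now reads that attribute and parses it with '%Y-%m-%dT%H:%M:%S%z' instead of applying a US-style format to the tag object itself (which is not a string and would fail in strptime). A small self-contained check of the new parsing; the timestamp below is made up:

# Hedged example (not from the repo): parsing the ISO-8601 value that XenForo
# stores in the datetime attribute of <time class="u-dt">.
from datetime import datetime
from bs4 import BeautifulSoup

html = '<time class="u-dt" datetime="2022-05-08T19:25:38-0400">May 8, 2022</time>'
soup = BeautifulSoup(html, 'html.parser')

dt = soup.find('time', {"class": "u-dt"}).get('datetime')
date_time_obj = datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S%z')
print(date_time_obj)  # 2022-05-08 19:25:38-04:00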


+2 -2  Forums/Initialization/forums_mining.py

@@ -113,12 +113,12 @@ if __name__ == '__main__':
         crawlerAbyssForum()
     elif forum == "HiddenAnswers":
         crawlerHiddenAnswers()
-    elif forum == "Altenens":
-        crawlerAltenens()
     elif forum == 'Procrax':
         crawlerProcraxForum()
     elif forum == 'Cardingleaks':
         crawlerCardingleaks()
+    elif forum == 'Altenens':
+        crawlerAltenens()


+14 -12  Forums/Initialization/prepare_parser.py

@@ -8,6 +8,7 @@ from Forums.DB_Connection.db_connection import *
 from Forums.BestCardingWorld.parser import *
 from Forums.CryptBB.parser import *
 from Forums.OnniForums.parser import *
+from Forums.Altenens.parser import *
 from Forums.Classifier.classify_product import predict
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
@@ -151,27 +152,27 @@ def new_parse(forum, url, createLog):
                rmm = cryptBB_description_parser(soup)
            elif forum == "OnniForums":
                rmm = onniForums_description_parser(soup)
+            elif forum == "Altenens":
+                rmm = altenens_description_parser(soup)

            # key = u"Top:" + rmm[0].upper().strip() + u" User:" + rmm[2][0].upper().strip()
            key = u"Url:" + os.path.basename(line2).replace(".html", "")

-            # check if page or page exists at the end of a string followed by a series of numbers
-            #if yes add to other if no add to first page dictionary
-            # save descritions into record in memory
-            check = re.compile(r'(?<=Page|page)[0-9]*')
+            # check if "page1" exists at the end of a string
+            # if yes add to first page directory if no add to other
+            check = re.compile(r'page1$')
            if check.search(key):
-                # print(key, 'is an other page\n')
-                other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}
-            else:
                # print(key, 'is a first page\n')
                detPage[key] = {'rmm': rmm, 'files': [os.path.basename(line2)]}
+            else:
+                # print(key, 'is an other page\n')
+                other[key] = {'rmm': rmm, 'filename': os.path.basename(line2)}

        except:
            nError += 1
            print("There was a problem to parse the file " + line2 + " in the Description section!")
-            traceback.print_exc()
            if createLog:
                logFile.write(str(nError) + ". There was a problem to parse the file " + line2 + " in the Description section.\n")
@@ -195,7 +196,6 @@ def new_parse(forum, url, createLog):
            other.pop(k)

-
    # Parsing the Listing Pages and put the tag's content into a list
    for index, line1 in enumerate(lines):
@@ -231,6 +231,8 @@ def new_parse(forum, url, createLog):
                rw = cryptBB_listing_parser(soup)
            elif forum == "OnniForums":
                rw = onniForums_listing_parser(soup)
+            elif forum == "Altenens":
+                rw = altenens_listing_parser(soup)

        except:
@@ -255,8 +257,8 @@ def new_parse(forum, url, createLog):
            # print(rec)

            # key = u"Top:" + rec[1].upper().strip() + u" User:" + rec[5].upper().strip()
-            key = u"Url:" + cleanLink(rec[6])
-            print(key)
+            key = u"Url:" + cleanLink(rec[6]) + "page1"
+            # print(key)

            if key in detPage:
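
The "page1" suffix appended to the listing key mirrors the crawler's new file naming, and the page1$ regex in the description pass sorts saved pages into the two dictionaries: first pages go to detPage, continuation pages to other. A minimal sketch of that bucketing, assuming description files follow the crawler's "<topic>pageN.html" naming; the filenames below are illustrative:

# Hedged sketch of the first-page bucketing done above; filenames are made up.
import os
import re

check = re.compile(r'page1$')   # same pattern as above: the key must end in "page1"

for line2 in ['carding-tutorial.12345page1.html', 'carding-tutorial.12345page2.html']:
    key = u"Url:" + os.path.basename(line2).replace(".html", "")
    if check.search(key):
        print(key, '-> detPage (first page of a topic)')
    else:
        print(key, '-> other (continuation page)')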


+2 -2  MarketPlaces/Tor2door/crawler_selenium.py

@@ -30,7 +30,7 @@ baseURL = 'http://yzrrne3pveltulbavydr2kiashvlnysdwclwmklo6cyjuqpxi7ku4xqd.onion

 # Opens Tor Browser, crawls the website
 def startCrawling():
     opentor()
-    # marketName = getMarketName()
+    # marketName = getMKTName()
     driver = getAccess()

     if driver != 'down':
@@ -105,7 +105,7 @@ def login(driver):

 # Returns the name of the website
-def getMarketName():
+def getMKTName():
     name = 'Tor2door'
     return name

